rewritten in Go for performance

2022-08-21 07:53:21 +04:00 · 2022-08-21 07:53:21 +04:00 · 366dd6375b
parent 9d474741a1
commit 366dd6375b
8 changed files with 267 additions and 179 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,2 +1,2 @@
 _cache/
-*__pycache__/
+rssparser
--- a/README.md
+++ b/README.md
@ -1,6 +1,2 @@
 # RSS Parsers
 ## Required PiP modules
 - flask
 - pyquery
 - feedgen
--- a/app.go
+++ b/app.go
@ -0,0 +1,166 @@
 package main
 import (
 	"fmt"
 	"log"
 	"net/http"
 	"os"
 	"path/filepath"
 	"sync"
 	"time"
 	"github.com/gorilla/feeds"
 )
 // RssEntry is a single article scraped by a Parser, before it is turned
 // into a feed item.
 type RssEntry struct {
 	// title, url and descripiton hold the article headline, its absolute
 	// link and its summary text.
 	title, url, descripiton string
 	// time is the article's publication timestamp.
 	time time.Time
 }
 // Parser describes one scraped site: how to fetch its entries plus the
 // metadata used to publish and cache the resulting feed.
 type Parser interface {
 	// Feed metadata: title, description and the site's root URL.
 	Title() string
 	Description() string
 	RootUrl() string
 	// ServerUrl is the HTTP path under which the feed is served.
 	ServerUrl() string
 	// Cache settings: cache file base name and lifetime in seconds.
 	CacheName() string
 	CacheTimeout() int
 	// Parse scrapes the site and returns its entries.
 	Parse() []RssEntry
 }
 // ParserFeed pairs a Parser with the most recently rendered RSS document
 // for it.
 type ParserFeed struct {
 	// parser supplies the entries and the feed/cache metadata.
 	parser Parser
 	// rss is the rendered RSS XML; SetFeed and GetFeed access it under mutex.
 	rss    string
 	// mutex guards rss.
 	mutex  *sync.Mutex
 	// time records when rss was produced: the cache file's modification
 	// time after ReadCache, or the time of the last ParseAndUpdateCache.
 	time   time.Time
 }
 // CachePath returns the location of the on-disk RSS cache for this feed,
 // "./_cache/<CacheName>.rss".
 func (pf ParserFeed) CachePath() string {
 	const (
 		cacheDir = "./_cache/"
 		cacheExt = ".rss"
 	)
 	name := pf.parser.CacheName()
 	return cacheDir + name + cacheExt
 }
 // ReadCache loads the cached feed from disk into memory and reports whether
 // the feed needs to be regenerated. It returns true when the cache file
 // cannot be stat'ed or read, or when the file's modification time is older
 // than the parser's CacheTimeout (in seconds).
 func (pf *ParserFeed) ReadCache() bool {
 	cacheFile := pf.CachePath()
 	info, statErr := os.Stat(cacheFile)
 	if statErr != nil {
 		return true
 	}
 	contents, readErr := os.ReadFile(cacheFile)
 	if readErr != nil {
 		return true
 	}
 	pf.SetFeed(string(contents))
 	pf.time = info.ModTime()
 	ageSeconds := time.Since(pf.time).Seconds()
 	maxAgeSeconds := float64(pf.parser.CacheTimeout())
 	return ageSeconds > maxAgeSeconds
 }
 // ParseAndUpdateCache scrapes the source site through the parser, renders
 // the entries as an RSS document, writes that document to the cache file and
 // publishes it as the in-memory feed.
 //
 // If rendering fails, the previously served feed and cache file are left
 // untouched; only the timestamp is advanced so that RunWorker waits a full
 // cache period before retrying. Failures to write the cache file are logged
 // and do not prevent the new feed from being served.
 func (pf *ParserFeed) ParseAndUpdateCache() {
 	now := time.Now()
 	feed := &feeds.Feed{
 		Title:       pf.parser.Title(),
 		Link:        &feeds.Link{Href: pf.parser.RootUrl()},
 		Description: pf.parser.Description(),
 		Created:     now,
 	}
 	for _, re := range pf.parser.Parse() {
 		// Use the scraped summary; fall back to the title when there is none.
 		description := re.descripiton
 		if description == "" {
 			description = re.title
 		}
 		// Use the real publication date; entries without one are stamped now.
 		created := re.time
 		if created.IsZero() {
 			created = now
 		}
 		feed.Items = append(feed.Items, &feeds.Item{
 			Id:          re.url,
 			Title:       re.title,
 			Link:        &feeds.Link{Href: re.url},
 			Description: description,
 			Created:     created,
 		})
 	}
 	rssFeed, err := feed.ToRss()
 	if err != nil {
 		log.Printf("rendering feed %q: %v", pf.parser.CacheName(), err)
 		// Keep the old feed, but move the timestamp forward so the worker
 		// does not retry in a tight loop.
 		pf.mutex.Lock()
 		pf.time = time.Now()
 		pf.mutex.Unlock()
 		return
 	}
 	path := pf.CachePath()
 	if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
 		log.Printf("creating cache directory for %q: %v", path, err)
 	} else if err := os.WriteFile(path, []byte(rssFeed), 0o644); err != nil {
 		log.Printf("writing cache file %q: %v", path, err)
 	}
 	pf.mutex.Lock()
 	pf.rss = rssFeed
 	pf.time = time.Now()
 	pf.mutex.Unlock()
 }
 // RunWorker keeps the feed fresh forever: it loads the on-disk cache,
 // regenerates the feed immediately when ReadCache reports it stale, and then
 // loops, sleeping until the cache lifetime has elapsed and regenerating the
 // feed again. It never returns and is meant to run in its own goroutine.
 func (pf *ParserFeed) RunWorker() {
 	if stale := pf.ReadCache(); stale {
 		pf.ParseAndUpdateCache()
 	}
 	for {
 		lifetime := time.Duration(pf.parser.CacheTimeout()) * time.Second
 		deadline := pf.time.Add(lifetime)
 		// Whole seconds left until the current feed expires.
 		remaining := deadline.Unix() - time.Now().Unix()
 		fmt.Println(remaining)
 		if remaining > 0 {
 			time.Sleep(time.Duration(remaining) * time.Second)
 		}
 		pf.ParseAndUpdateCache()
 		fmt.Println("update")
 	}
 }
 // SetFeed replaces the in-memory RSS document while holding the mutex.
 func (pf *ParserFeed) SetFeed(feed string) {
 	pf.mutex.Lock()
 	defer pf.mutex.Unlock()
 	pf.rss = feed
 }
 // GetFeed returns the current in-memory RSS document, read while holding
 // the mutex.
 func (pf *ParserFeed) GetFeed() string {
 	pf.mutex.Lock()
 	defer pf.mutex.Unlock()
 	return pf.rss
 }
 // main starts one background refresh worker per parser, then serves an HTML
 // index page at "/" and each parser's RSS feed at its ServerUrl on port 8081.
 func main() {
 	parsers := []*ParserFeed{
 		{parser: NvidiaParser{}, mutex: &sync.Mutex{}},
 	}
 	for _, p := range parsers {
 		go p.RunWorker()
 	}
 	rootPage :=
 		`<!DOCTYPE html> <html><head>
 		<title>RSS index page</title>
 		<meta charset="utf-8">
 		</head>
 		<body>
 		<h1>RSS index page</h1>
 		<h3>Custom generators of RSS feed from different blogs</h3>
 		<br/>`
 	for _, p := range parsers {
 		rootPage += fmt.Sprintf("<a href=%v>%v</a>", p.parser.ServerUrl(), p.parser.Title())
 	}
 	rootPage += "</body></html>"
 	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
 		// Fprint, not Fprintf: the page is data, and a '%' in a feed title
 		// must not be interpreted as a format verb.
 		fmt.Fprint(w, rootPage)
 	})
 	for _, p := range parsers {
 		// Copy the loop variable: before Go 1.22 every closure shares one p,
 		// so all routes would serve the last parser's feed.
 		p := p
 		http.HandleFunc(p.parser.ServerUrl(), func(w http.ResponseWriter, r *http.Request) {
 			feed := p.GetFeed()
 			if feed == "" {
 				// The worker has not produced a feed yet.
 				http.Error(w, "feed is not available yet", http.StatusServiceUnavailable)
 				return
 			}
 			w.Header().Set("Content-Type", "text/xml")
 			fmt.Fprint(w, feed)
 		})
 	}
 	log.Fatal(http.ListenAndServe(":8081", nil))
 }
--- a/app.py
+++ b/app.py
@ -1,107 +0,0 @@
 from flask import Flask, Response, abort, request
 from parsers.nvidia import NvidiaParser
 from urllib.parse import urlparse
 import os, time, threading
 class ParserData:
    """Runtime state for one parser: the parser object, its latest RSS
    payload, the time that payload was produced, and a lock guarding both."""
    # Class-level defaults; __init__ overrides them on every instance.
    parser = None
    rss = None
    time = None
    def __init__(self, parser):
        self.parser = parser
        self.rss = None
        self.time = None
        # One lock per instance: a class-level Lock would be shared by every
        # ParserData and would serialise unrelated feeds.
        self.lock = threading.Lock()
 app = Flask(__name__)
 parsers = [ParserData(NvidiaParser())]
 def getCachePath(parser):
    """Return the cache file path for *parser*: './_cache/' followed by the
    name of the parser's class."""
    return './_cache/{}'.format(type(parser).__name__)
 def checkParserCache(parser):
    """Load the cached RSS for *parser* and report whether it must be rebuilt.

    Returns True when the cache file could not be read (an empty cache file
    is created in that case) or when the file is older than the parser's
    CACHE_TIMEOUT in seconds; otherwise returns False. On a successful read,
    parser.rss and parser.time are set from the file.
    """
    path = getCachePath(parser.parser)
    try:
        filetime = os.path.getmtime(path)
        with open(path, 'r') as f:
            parser.rss = f.read()
    except (OSError, UnicodeDecodeError):
        # Missing or unreadable cache: create an empty file and ask for a
        # rebuild. Only I/O and decoding failures are handled, so that
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        open(path, 'w').close()
        return True
    parser.time = filetime
    return (time.time() - filetime) > parser.parser.CACHE_TIMEOUT
 def updateParserAndCache(parser):
    """Regenerate the feed for *parser*, publish it in memory under the
    parser's lock, and write it to the cache file."""
    rss = parser.parser.getRss()
    with parser.lock:
        parser.rss = rss
        parser.time = time.time()
    # Write the local copy: reading parser.rss again here would touch shared
    # state outside the lock. Accept text as well as bytes.
    text = rss.decode('utf-8') if isinstance(rss, bytes) else rss
    with open(getCachePath(parser.parser), 'w') as f:
        f.write(text)
 def runParserWorker(parser):
    """Keep *parser*'s feed fresh forever: rebuild immediately when the cache
    is stale, then rebuild again each time CACHE_TIMEOUT seconds have passed
    since the last update. Never returns."""
    if checkParserCache(parser):
        updateParserAndCache(parser)
    while True:
        nextmark = parser.time + parser.parser.CACHE_TIMEOUT
        sleepfor = nextmark - time.time()
        if sleepfor > 0:
            # Sleep for the value that was checked: recomputing it here could
            # yield a negative number and make time.sleep raise ValueError.
            time.sleep(sleepfor)
        updateParserAndCache(parser)
 def runParserWorkers():
    """Start one background worker thread for every registered parser."""
    workers = [threading.Thread(target=runParserWorker, args=(parser,)) for parser in parsers]
    for worker in workers:
        worker.start()
 # Static HTML index page: a fixed header followed by one link per parser.
 indexPage = """
 <html>
 <head>
 <title>RSS index page</title>
 </head>
 <body>
 <h1>RSS index page</h1>
 <h3>Custom generators of RSS feed from different blogs</h3>
 <br/>
 """
 indexPage += ''.join(f'<a href={parser.parser.URL}>{parser.parser.NAME}</a>' for parser in parsers)
 indexPage += '</body></html>'
@app.route('/')
 def index():
    """Serve the pre-built HTML index page that links to every parser feed."""
    return indexPage
 #@app.route('/favicon.ico')
 #def favicon():
 #    referrer = request.referrer
 #    if referrer != None:
 #        u = urlparse(referrer)
 #
 #        for parser in parsers:
 #            if parser.parser.URL == u.path:
 #                favi = parser.parser.favicon
 #                return Response(favi.content, mimetype=favi.headers['Content-Type'])
 #
 #    abort(404)
 runParserWorkers()
 def makeFeedView(parser):
    """Build the view function that serves *parser*'s RSS feed.

    A factory is used so that each view is bound to its own parser; a
    function defined directly in the loop would look up the loop variable
    at request time and always serve the last parser.
    """
    def query():
        with parser.lock:
            if parser.rss is None:
                abort(404)
            return Response(parser.rss, mimetype='text/xml')
    return query
 for parser in parsers:
    # Give each route its own endpoint name: Flask refuses to register two
    # different view functions under the same endpoint ('query').
    app.add_url_rule(
        parser.parser.URL,
        endpoint=type(parser.parser).__name__,
        view_func=makeFeedView(parser),
    )
--- a/go.mod
+++ b/go.mod
@ -0,0 +1,10 @@
 module rssparser
 go 1.19
 require (
 	github.com/PuerkitoBio/goquery v1.8.0 // indirect
 	github.com/andybalholm/cascadia v1.3.1 // indirect
 	github.com/gorilla/feeds v1.1.1 // indirect
 	golang.org/x/net v0.0.0-20210916014120-12bc252f5db8 // indirect
 )
--- a/go.sum
+++ b/go.sum
@ -0,0 +1,13 @@
 github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U=
 github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI=
 github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
 github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
 github.com/gorilla/feeds v1.1.1 h1:HwKXxqzcRNg9to+BbvJog4+f3s/xzvtZXICcQGutYfY=
 github.com/gorilla/feeds v1.1.1/go.mod h1:Nk0jZrvPFZX1OBe5NPiddPw7CfwF6Q9eqzaBbaightA=
 golang.org/x/net v0.0.0-20210916014120-12bc252f5db8 h1:/6y1LfuqNuQdHAm0jjtPtgRcxIxjVZgm5OTu8/QhZvk=
 golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
 golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
 golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
--- a/nvidia.go
+++ b/nvidia.go
@ -0,0 +1,77 @@
 package main
 import (
 	"fmt"
 	"net/http"
 	"time"
 	"github.com/PuerkitoBio/goquery"
 )
 // NvidiaParser scrapes the NVIDIA Research publications listing. It carries
 // no state; all of its settings are returned by its methods.
 type NvidiaParser struct{}
 func (this NvidiaParser) Parse() []RssEntry {
 	var result []RssEntry
 	resp, err := http.Get(this.RootUrl() + "/publications")
 	if err != nil {
 	}
 	doc, err := goquery.NewDocumentFromResponse(resp)
 	// parse articles and links
 	doc.Find(".views-field-title").Each(func(i int, s *goquery.Selection) {
 		link := s.Find("a")
 		href, exists := link.Attr("href")
 		if exists {
 			result = append(result, RssEntry{title: link.Text(), url: this.RootUrl() + href})
 		}
 	})
 	// parse times
 	for _, re := range result {
 		resp, err := http.Get(re.url)
 		if err != nil {
 		}
 		doc, err := goquery.NewDocumentFromResponse(resp)
 		re.descripiton = doc.Find(".field--type-text-with-summary").Text()
 		date, exists := doc.Find(".field--name-field-publication-date").Find("time").Attr("datetime")
 		if exists {
 			t, err := time.Parse(time.RFC3339, date)
 			if err == nil {
 				re.time = t
 			}
 		}
 		fmt.Println(re.url)
 	}
 	return result
 }
 // Title returns the feed title.
 func (NvidiaParser) Title() string {
 	const title = "NVidia Research"
 	return title
 }
 // Description returns the feed description.
 func (NvidiaParser) Description() string {
 	const description = "NVidia Research papers"
 	return description
 }
 // RootUrl returns the base URL of the scraped site, without a trailing slash.
 func (NvidiaParser) RootUrl() string {
 	const rootURL = "https://research.nvidia.com"
 	return rootURL
 }
 // ServerUrl returns the local HTTP path under which this feed is served.
 func (NvidiaParser) ServerUrl() string {
 	const route = "/nvidia"
 	return route
 }
 // CacheName returns the base name of this feed's cache file.
 func (NvidiaParser) CacheName() string {
 	const cacheName = "NvidiaParser"
 	return cacheName
 }
 // CacheTimeout returns the cache lifetime in seconds (two hours).
 func (NvidiaParser) CacheTimeout() int {
 	const twoHoursInSeconds = 2 * 60 * 60
 	return twoHoursInSeconds
 }
--- a/parsers/nvidia.py
+++ b/parsers/nvidia.py
@ -1,67 +0,0 @@
 from pyquery import PyQuery as pq
 from feedgen.feed import FeedGenerator
 from collections import namedtuple
 import multiprocessing
 from datetime import datetime
 import requests
 Entry = namedtuple('Entry', 'url fe')
 class NvidiaParser:
    """Builds an RSS feed from the NVIDIA Research publications listing."""
    NAME = 'NVidia Research'
    URL = '/nvidia'
    # Cache lifetime in seconds.
    CACHE_TIMEOUT = 3600
    root_url = 'https://research.nvidia.com'
    favicon = None
    def loadFavicon(self):
        """Best-effort download of the site's favicon into self.favicon;
        request failures are ignored."""
        try:
            favUrl = NvidiaParser.root_url + '/themes/custom/nvidia/favicon.ico'
            self.favicon = requests.get(favUrl)
        except requests.RequestException:
            pass
 #    def __init__(self):
 #        self.loadFavicon()
    @staticmethod
    def parseNvidiaDate(entry):
        """Fetch the article page at entry.url and set the publication date
        on entry.fe. Entries whose page carries no datetime attribute are
        left without a date."""
        dom = pq(entry.url)
        print(entry.url)
        stamp = dom('.field--name-field-publication-date').find('time').attr.datetime
        if not stamp:
            return
        entry.fe.pubDate(datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S%z'))
    def getRss(self):
        """Scrape the publications listing and every linked article page and
        return the feed rendered by FeedGenerator.rss_str()."""
        d = pq(self.root_url +'/publications')
 #        self.loadFavicon()
        fg = FeedGenerator()
        fg.id(self.root_url)
        fg.title('NVidia Research')
        fg.link(href=self.root_url, rel='alternate')
        fg.description('NVidia Research papers')
        entries = []
        for elem in d('.views-field-title').items():
            link = elem.find('a')
            url = self.root_url + link.attr.href
            title = link.text()
            fe = fg.add_entry()
            fe.id(url)
            fe.title(title)
            fe.link(href=url)
            entries.append(Entry(url, fe))
        for entry in entries:
            NvidiaParser.parseNvidiaDate(entry)
            print(entry.url)
 #        with multiprocessing.Pool(8) as p:
 #            p.map(NvidiaParser.parseNvidiaDate, entries)
        return fg.rss_str()
`@ -1,2 +1,2 @@`
	`_cache/`	`_cache/`
	`*__pycache__/`	`rssparser`