diff --git a/.gitignore b/.gitignore
index d6b8423..f9fec48 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,2 @@
 _cache/
-*__pycache__/
+rssparser
diff --git a/README.md b/README.md
index e6ac167..997ac06 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,2 @@
 # RSS Parsers
 
-## Required PiP modules
-- flask
-- pyquery
-- feedgen
diff --git a/app.go b/app.go
new file mode 100644
index 0000000..87c1806
--- /dev/null
+++ b/app.go
@@ -0,0 +1,166 @@
+package main
+
+import (
+	"fmt"
+	"log"
+	"net/http"
+	"os"
+	"path/filepath"
+	"sync"
+	"time"
+
+	"github.com/gorilla/feeds"
+)
+
+// RssEntry is a single article scraped from a source site.
+type RssEntry struct {
+	title       string
+	url         string
+	description string
+	time        time.Time
+}
+
+// Parser is the interface every site-specific scraper implements.
+type Parser interface {
+	Parse() []RssEntry
+
+	Title() string
+	Description() string
+	RootUrl() string
+	ServerUrl() string
+	CacheName() string
+	CacheTimeout() int
+}
+
+// ParserFeed pairs a Parser with its cached RSS output.
+type ParserFeed struct {
+	parser Parser
+	rss    string
+	mutex  *sync.Mutex
+	time   time.Time
+}
+
+func (this ParserFeed) CachePath() string {
+	return "./_cache/" + this.parser.CacheName() + ".rss"
+}
+
+// ReadCache loads the cached feed from disk and reports whether a
+// fresh parse is needed (cache missing, unreadable, or stale).
+func (this *ParserFeed) ReadCache() bool {
+	path := this.CachePath()
+	stat, err := os.Stat(path)
+	if err != nil {
+		return true
+	}
+
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return true
+	}
+
+	this.SetFeed(string(data))
+	this.time = stat.ModTime()
+
+	return time.Since(this.time).Seconds() > float64(this.parser.CacheTimeout())
+}
+
+// ParseAndUpdateCache runs the parser, rebuilds the RSS document, and
+// stores it both in memory and in the on-disk cache.
+func (this *ParserFeed) ParseAndUpdateCache() {
+	feed := &feeds.Feed{
+		Title:       this.parser.Title(),
+		Link:        &feeds.Link{Href: this.parser.RootUrl()},
+		Description: this.parser.Description(),
+		Created:     time.Now(),
+	}
+
+	entries := this.parser.Parse()
+
+	for _, re := range entries {
+		feed.Items = append(feed.Items, &feeds.Item{
+			Id:          re.url,
+			Title:       re.title,
+			Link:        &feeds.Link{Href: re.url},
+			Description: re.description,
+			Created:     re.time,
+		})
+	}
+
+	rssFeed, err := feed.ToRss()
+	if err != nil {
+		log.Println(err)
+		return
+	}
+
+	dir := filepath.Dir(this.CachePath())
+	os.MkdirAll(dir, os.ModePerm)
+	os.WriteFile(this.CachePath(), []byte(rssFeed), os.ModePerm)
+
+	this.mutex.Lock()
+	this.rss = rssFeed
+	this.time = time.Now()
+	this.mutex.Unlock()
+}
+
+// RunWorker refreshes the feed every CacheTimeout seconds.
+func (this *ParserFeed) RunWorker() {
+	if this.ReadCache() {
+		this.ParseAndUpdateCache()
+	}
+
+	for {
+		nextmark := this.time.Add(time.Second * time.Duration(this.parser.CacheTimeout()))
+		sleepfor := nextmark.Unix() - time.Now().Unix()
+
+		if sleepfor > 0 {
+			time.Sleep(time.Second * time.Duration(sleepfor))
+		}
+		this.ParseAndUpdateCache()
+		log.Println("updated feed:", this.parser.CacheName())
+	}
+}
+
+func (this *ParserFeed) SetFeed(feed string) {
+	this.mutex.Lock()
+	this.rss = feed
+	this.mutex.Unlock()
+}
+
+func (this *ParserFeed) GetFeed() string {
+	this.mutex.Lock()
+	rss := this.rss
+	this.mutex.Unlock()
+
+	return rss
+}
+
+func main() {
+	parsers := []*ParserFeed{
+		{parser: NvidiaParser{}, mutex: &sync.Mutex{}},
+	}
+
+	for _, p := range parsers {
+		go p.RunWorker()
+	}
+
+	rootPage :=
+		`<html>
+<head>
+<title>RSS index page</title>
+</head>
+<body>
+<h1>RSS index page</h1>
+
+<p>Custom generators of RSS feed from different blogs</p>
+
+`
+
+	for _, p := range parsers {
+		rootPage += fmt.Sprintf("<a href=\"%v\">%v</a><br>", p.parser.ServerUrl(), p.parser.Title())
+	}
+
+	rootPage += "</body></html>"
+
+	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
+		fmt.Fprint(w, rootPage)
+	})
+
+	for _, p := range parsers {
+		p := p // capture the loop variable; with Go 1.19 the closures would otherwise all see the last parser
+		http.HandleFunc(p.parser.ServerUrl(), func(w http.ResponseWriter, r *http.Request) {
+			w.Header().Set("Content-Type", "text/xml")
+			fmt.Fprint(w, p.GetFeed())
+		})
+	}
+
+	log.Fatal(http.ListenAndServe(":8081", nil))
+}
diff --git a/app.py b/app.py
deleted file mode 100644
index ca15f0c..0000000
--- a/app.py
+++ /dev/null
@@ -1,107 +0,0 @@
-from flask import Flask, Response, abort, request
-from parsers.nvidia import NvidiaParser
-from urllib.parse import urlparse
-import os, time, threading
-
-class ParserData:
-    parser = None
-    rss = None
-    time = None
-    lock = threading.Lock()
-
-    def __init__(self, parser):
-        self.parser = parser
-
-app = Flask(__name__)
-
-parsers = [ParserData(NvidiaParser())]
-
-def getCachePath(parser):
-    path = './_cache/' + parser.__class__.__name__
-    return path
-
-def checkParserCache(parser):
-    path = getCachePath(parser.parser)
-
-    try:
-        os.path.getmtime(path)
-        with open(path, 'r') as f:
-            parser.rss = f.read()
-    except:
-        os.makedirs(os.path.dirname(path), exist_ok=True)
-        open(path, 'w').close()
-        return True
-
-    filetime = os.path.getmtime(path)
-    currtime = time.time()
-    parser.time = filetime
-
-    return (currtime - filetime) > parser.parser.CACHE_TIMEOUT
-
-def updateParserAndCache(parser):
-    rss = parser.parser.getRss()
-
-    with parser.lock:
-        parser.rss = rss
-        parser.time = time.time()
-
-    with open(getCachePath(parser.parser), 'w') as f:
-        f.write(parser.rss.decode('utf-8'))
-
-def runParserWorker(parser):
-    if checkParserCache(parser):
-        updateParserAndCache(parser)
-
-    while True:
-        nextmark = parser.time + parser.parser.CACHE_TIMEOUT
-        sleepfor = nextmark - time.time()
-        if sleepfor > 0:
-            time.sleep(nextmark - time.time())
-        updateParserAndCache(parser)
-
-def runParserWorkers():
-    for parser in parsers:
-        threading.Thread(target=runParserWorker, args=(parser,)).start()
-
-indexPage = """
-<html>
-<head>
-<title>RSS index page</title>
-</head>
-<body>
-<h1>RSS index page</h1>
-
-<p>Custom generators of RSS feed from different blogs</p>
-
-""" - -for parser in parsers: - indexPage += f'{parser.parser.NAME}' -indexPage += '' -indexPage += '' - -@app.route('/') -def index(): - return indexPage - -#@app.route('/favicon.ico') -#def favicon(): -# referrer = request.referrer -# if referrer != None: -# u = urlparse(referrer) -# -# for parser in parsers: -# if parser.parser.URL == u.path: -# favi = parser.parser.favicon -# return Response(favi.content, mimetype=favi.headers['Content-Type']) -# -# abort(404) - -runParserWorkers() - -for parser in parsers: - @app.route(parser.parser.URL) - def query(): - with parser.lock: - if parser.rss == None: - abort(404) - return Response(parser.rss, mimetype='text/xml') diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..0357e52 --- /dev/null +++ b/go.mod @@ -0,0 +1,10 @@ +module rssparser + +go 1.19 + +require ( + github.com/PuerkitoBio/goquery v1.8.0 // indirect + github.com/andybalholm/cascadia v1.3.1 // indirect + github.com/gorilla/feeds v1.1.1 // indirect + golang.org/x/net v0.0.0-20210916014120-12bc252f5db8 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..a07a53d --- /dev/null +++ b/go.sum @@ -0,0 +1,13 @@ +github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U= +github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI= +github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c= +github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= +github.com/gorilla/feeds v1.1.1 h1:HwKXxqzcRNg9to+BbvJog4+f3s/xzvtZXICcQGutYfY= +github.com/gorilla/feeds v1.1.1/go.mod h1:Nk0jZrvPFZX1OBe5NPiddPw7CfwF6Q9eqzaBbaightA= +golang.org/x/net v0.0.0-20210916014120-12bc252f5db8 h1:/6y1LfuqNuQdHAm0jjtPtgRcxIxjVZgm5OTu8/QhZvk= +golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= diff --git a/nvidia.go b/nvidia.go new file mode 100644 index 0000000..3cd3cf9 --- /dev/null +++ b/nvidia.go @@ -0,0 +1,77 @@ +package main + +import ( + "fmt" + "net/http" + "time" + + "github.com/PuerkitoBio/goquery" +) + +type NvidiaParser struct { +} + +func (this NvidiaParser) Parse() []RssEntry { + var result []RssEntry + + resp, err := http.Get(this.RootUrl() + "/publications") + if err != nil { + } + + doc, err := goquery.NewDocumentFromResponse(resp) + + // parse articles and links + doc.Find(".views-field-title").Each(func(i int, s *goquery.Selection) { + link := s.Find("a") + href, exists := link.Attr("href") + if exists { + result = append(result, RssEntry{title: link.Text(), url: this.RootUrl() + href}) + } + }) + + // parse times + for _, re := range result { + resp, err := http.Get(re.url) + if err != nil { + } + + doc, err := goquery.NewDocumentFromResponse(resp) + re.descripiton = doc.Find(".field--type-text-with-summary").Text() + + date, exists := doc.Find(".field--name-field-publication-date").Find("time").Attr("datetime") + if exists { + t, err := time.Parse(time.RFC3339, date) + if err 
== nil { + re.time = t + } + } + + fmt.Println(re.url) + } + + return result +} + +func (this NvidiaParser) Title() string { + return "NVidia Research" +} + +func (this NvidiaParser) Description() string { + return "NVidia Research papers" +} + +func (this NvidiaParser) RootUrl() string { + return "https://research.nvidia.com" +} + +func (this NvidiaParser) ServerUrl() string { + return "/nvidia" +} + +func (this NvidiaParser) CacheName() string { + return "NvidiaParser" +} + +func (this NvidiaParser) CacheTimeout() int { + return 7200 +} diff --git a/parsers/nvidia.py b/parsers/nvidia.py deleted file mode 100644 index 556b996..0000000 --- a/parsers/nvidia.py +++ /dev/null @@ -1,67 +0,0 @@ -from pyquery import PyQuery as pq -from feedgen.feed import FeedGenerator -from collections import namedtuple -import multiprocessing -from datetime import datetime -import requests - -Entry = namedtuple('Entry', 'url fe') - -class NvidiaParser: - NAME = 'NVidia Research' - URL = '/nvidia' - CACHE_TIMEOUT = 3600 - root_url = 'https://research.nvidia.com' - favicon = None - - def loadFavicon(self): - try: - favUrl = NvidiaParser.root_url + '/themes/custom/nvidia/favicon.ico' - self.favicon = requests.get(favUrl) - except: - pass - -# def __init__(self): -# self.loadFavicon() - - def parseNvidiaDate(entry): - dom = pq(entry.url) - print(entry.url) - - time = dom('.field--name-field-publication-date').find('time').attr.datetime - time = datetime.strptime(time, '%Y-%m-%dT%H:%M:%S%z') - entry.fe.pubDate(time) - - def getRss(self): - d = pq(self.root_url +'/publications') -# self.loadFavicon() - - fg = FeedGenerator() - fg.id(self.root_url) - fg.title('NVidia Research') - fg.link(href=self.root_url, rel='alternate') - fg.description('NVidia Research papers') - - entries = [] - - for elem in d('.views-field-title').items(): - link = elem.find('a') - url = self.root_url + link.attr.href - title = link.text() - - fe = fg.add_entry() - fe.id(url) - fe.title(title) - fe.link(href=url) - - entries.append(Entry(url, fe)) - - for entry in entries: - NvidiaParser.parseNvidiaDate(entry) - print(entry.url) - -# with multiprocessing.Pool(8) as p: -# p.map(NvidiaParser.parseNvidiaDate, entries) - - return fg.rss_str() -