Rewritten in Go for performance

parent 9d474741a1
commit 366dd6375b

@@ -1,2 +1,2 @@
_cache/
*__pycache__/
rssparser

@@ -1,6 +1,2 @@
# RSS Parsers

## Required PiP modules
- flask
- pyquery
- feedgen

@@ -0,0 +1,166 @@
package main

import (
	"fmt"
	"log"
	"net/http"
	"os"
	"path/filepath"
	"sync"
	"time"

	"github.com/gorilla/feeds"
)

// RssEntry is a single item produced by a Parser.
type RssEntry struct {
	title       string
	url         string
	description string
	time        time.Time
}

// Parser describes one site-specific scraper that can be turned into an RSS feed.
type Parser interface {
	Parse() []RssEntry

	Title() string
	Description() string
	RootUrl() string
	ServerUrl() string
	CacheName() string
	CacheTimeout() int // seconds between refreshes
}

// ParserFeed pairs a Parser with its cached RSS output.
type ParserFeed struct {
	parser Parser
	rss    string
	mutex  *sync.Mutex
	time   time.Time
}

func (this ParserFeed) CachePath() string {
	return "./_cache/" + this.parser.CacheName() + ".rss"
}

// ReadCache loads the on-disk cache if present and reports whether the feed
// is stale and needs to be regenerated.
func (this *ParserFeed) ReadCache() bool {
	path := this.CachePath()
	stat, err := os.Stat(path)
	if err != nil {
		return true
	}

	data, err := os.ReadFile(path)
	if err != nil {
		return true
	}

	this.SetFeed(string(data))
	this.time = stat.ModTime()

	return time.Since(this.time).Seconds() > float64(this.parser.CacheTimeout())
}

// ParseAndUpdateCache runs the parser, rebuilds the RSS document and stores it
// both in memory and in the on-disk cache.
func (this *ParserFeed) ParseAndUpdateCache() {
	feed := &feeds.Feed{
		Title:       this.parser.Title(),
		Link:        &feeds.Link{Href: this.parser.RootUrl()},
		Description: this.parser.Description(),
		Created:     time.Now(),
	}

	entries := this.parser.Parse()

	for _, re := range entries {
		feed.Items = append(feed.Items, &feeds.Item{
			Id:          re.url,
			Title:       re.title,
			Link:        &feeds.Link{Href: re.url},
			Description: re.title,
			Created:     time.Now(),
		})
	}

	rssFeed, err := feed.ToRss()
	if err != nil {
		log.Println(err)
		return
	}

	// Best-effort cache write; the in-memory copy is still updated below.
	dir := filepath.Dir(this.CachePath())
	os.MkdirAll(dir, os.ModePerm)
	os.WriteFile(this.CachePath(), []byte(rssFeed), os.ModePerm)

	this.mutex.Lock()
	this.rss = rssFeed
	this.time = time.Now()
	this.mutex.Unlock()
}

// RunWorker refreshes the feed once if the cache is stale, then keeps
// refreshing it every CacheTimeout seconds.
func (this *ParserFeed) RunWorker() {
	if this.ReadCache() {
		this.ParseAndUpdateCache()
	}

	for {
		nextmark := this.time.Add(time.Second * time.Duration(this.parser.CacheTimeout()))
		sleepfor := nextmark.Unix() - time.Now().Unix()
		log.Println(this.parser.CacheName(), "sleeping for", sleepfor, "seconds")

		if sleepfor > 0 {
			time.Sleep(time.Second * time.Duration(sleepfor))
		}
		this.ParseAndUpdateCache()
		log.Println(this.parser.CacheName(), "feed updated")
	}
}

func (this *ParserFeed) SetFeed(feed string) {
	this.mutex.Lock()
	this.rss = feed
	this.mutex.Unlock()
}

func (this *ParserFeed) GetFeed() string {
	this.mutex.Lock()
	rss := this.rss
	this.mutex.Unlock()

	return rss
}

func main() {
	parsers := []*ParserFeed{
		{parser: NvidiaParser{}, mutex: &sync.Mutex{}},
	}

	// One background worker per parser keeps its feed up to date.
	for _, p := range parsers {
		go p.RunWorker()
	}

	rootPage :=
		`<!DOCTYPE html> <html><head>
	<title>RSS index page</title>
	<meta charset="utf-8">
	</head>
	<body>
	<h1>RSS index page</h1>
	<h3>Custom generators of RSS feed from different blogs</h3>
	<br/>`

	for _, p := range parsers {
		rootPage += fmt.Sprintf("<a href=%v>%v</a>", p.parser.ServerUrl(), p.parser.Title())
	}

	rootPage += "</body></html>"

	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprint(w, rootPage)
	})

	for _, p := range parsers {
		p := p // capture the loop variable; before Go 1.22 the closures would otherwise all see the last parser
		http.HandleFunc(p.parser.ServerUrl(), func(w http.ResponseWriter, r *http.Request) {
			w.Header().Set("Content-Type", "text/xml")
			fmt.Fprint(w, p.GetFeed())
		})
	}

	log.Fatal(http.ListenAndServe(":8081", nil))
}
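
Each feed source plugs into the server through the Parser interface above: implement the metadata getters plus Parse(), then add an instance to the parsers slice in main(). As a rough sketch of how a second source could be wired up (ExampleParser, its URLs and the returned entry are hypothetical placeholders, not part of this commit):

package main

import "time"

// ExampleParser is a hypothetical second feed source, shown only to
// illustrate the Parser interface; all URLs and values are placeholders.
type ExampleParser struct{}

func (p ExampleParser) Parse() []RssEntry {
	// A real implementation would fetch p.RootUrl() here (e.g. with goquery)
	// and return one RssEntry per article found on the page.
	return []RssEntry{
		{title: "Example post", url: p.RootUrl() + "/posts/1", time: time.Now()},
	}
}

func (p ExampleParser) Title() string       { return "Example Blog" }
func (p ExampleParser) Description() string { return "Example blog posts" }
func (p ExampleParser) RootUrl() string     { return "https://blog.example.com" }
func (p ExampleParser) ServerUrl() string   { return "/example" }
func (p ExampleParser) CacheName() string   { return "ExampleParser" }
func (p ExampleParser) CacheTimeout() int   { return 3600 }

// Registered in main() next to the existing entry:
//   parsers := []*ParserFeed{
//       {parser: NvidiaParser{}, mutex: &sync.Mutex{}},
//       {parser: ExampleParser{}, mutex: &sync.Mutex{}},
//   }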

app.py
@@ -1,107 +0,0 @@
from flask import Flask, Response, abort, request
from parsers.nvidia import NvidiaParser
from urllib.parse import urlparse
import os, time, threading


class ParserData:
    parser = None
    rss = None
    time = None
    lock = threading.Lock()

    def __init__(self, parser):
        self.parser = parser


app = Flask(__name__)

parsers = [ParserData(NvidiaParser())]


def getCachePath(parser):
    path = './_cache/' + parser.__class__.__name__
    return path


def checkParserCache(parser):
    path = getCachePath(parser.parser)

    try:
        os.path.getmtime(path)
        with open(path, 'r') as f:
            parser.rss = f.read()
    except:
        os.makedirs(os.path.dirname(path), exist_ok=True)
        open(path, 'w').close()
        return True

    filetime = os.path.getmtime(path)
    currtime = time.time()
    parser.time = filetime

    return (currtime - filetime) > parser.parser.CACHE_TIMEOUT


def updateParserAndCache(parser):
    rss = parser.parser.getRss()

    with parser.lock:
        parser.rss = rss
        parser.time = time.time()

        with open(getCachePath(parser.parser), 'w') as f:
            f.write(parser.rss.decode('utf-8'))


def runParserWorker(parser):
    if checkParserCache(parser):
        updateParserAndCache(parser)

    while True:
        nextmark = parser.time + parser.parser.CACHE_TIMEOUT
        sleepfor = nextmark - time.time()
        if sleepfor > 0:
            time.sleep(nextmark - time.time())
        updateParserAndCache(parser)


def runParserWorkers():
    for parser in parsers:
        threading.Thread(target=runParserWorker, args=(parser,)).start()


indexPage = """
<html>
<head>
<title>RSS index page</title>
</head>
<body>
<h1>RSS index page</h1>
<h3>Custom generators of RSS feed from different blogs</h3>
<br/>
"""

for parser in parsers:
    indexPage += f'<a href={parser.parser.URL}>{parser.parser.NAME}</a>'
indexPage += '</body>'
indexPage += '</html>'


@app.route('/')
def index():
    return indexPage


#@app.route('/favicon.ico')
#def favicon():
#    referrer = request.referrer
#    if referrer != None:
#        u = urlparse(referrer)
#
#        for parser in parsers:
#            if parser.parser.URL == u.path:
#                favi = parser.parser.favicon
#                return Response(favi.content, mimetype=favi.headers['Content-Type'])
#
#    abort(404)


runParserWorkers()

for parser in parsers:
    @app.route(parser.parser.URL)
    def query():
        with parser.lock:
            if parser.rss == None:
                abort(404)
            return Response(parser.rss, mimetype='text/xml')

@@ -0,0 +1,10 @@
module rssparser

go 1.19

require (
	github.com/PuerkitoBio/goquery v1.8.0 // indirect
	github.com/andybalholm/cascadia v1.3.1 // indirect
	github.com/gorilla/feeds v1.1.1 // indirect
	golang.org/x/net v0.0.0-20210916014120-12bc252f5db8 // indirect
)

@@ -0,0 +1,13 @@
github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U=
github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI=
github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
github.com/gorilla/feeds v1.1.1 h1:HwKXxqzcRNg9to+BbvJog4+f3s/xzvtZXICcQGutYfY=
github.com/gorilla/feeds v1.1.1/go.mod h1:Nk0jZrvPFZX1OBe5NPiddPw7CfwF6Q9eqzaBbaightA=
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8 h1:/6y1LfuqNuQdHAm0jjtPtgRcxIxjVZgm5OTu8/QhZvk=
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=

@@ -0,0 +1,77 @@
package main

import (
	"log"
	"net/http"
	"time"

	"github.com/PuerkitoBio/goquery"
)

// NvidiaParser scrapes the NVIDIA Research publications page.
type NvidiaParser struct {
}

func (this NvidiaParser) Parse() []RssEntry {
	var result []RssEntry

	resp, err := http.Get(this.RootUrl() + "/publications")
	if err != nil {
		log.Println(err)
		return result
	}

	doc, err := goquery.NewDocumentFromResponse(resp)
	if err != nil {
		log.Println(err)
		return result
	}

	// parse article titles and links
	doc.Find(".views-field-title").Each(func(i int, s *goquery.Selection) {
		link := s.Find("a")
		href, exists := link.Attr("href")
		if exists {
			result = append(result, RssEntry{title: link.Text(), url: this.RootUrl() + href})
		}
	})

	// parse descriptions and publication dates from each article page;
	// iterate by index so the stored entries are updated, not copies
	for i := range result {
		re := &result[i]

		resp, err := http.Get(re.url)
		if err != nil {
			log.Println(err)
			continue
		}

		doc, err := goquery.NewDocumentFromResponse(resp)
		if err != nil {
			log.Println(err)
			continue
		}

		re.description = doc.Find(".field--type-text-with-summary").Text()

		date, exists := doc.Find(".field--name-field-publication-date").Find("time").Attr("datetime")
		if exists {
			t, err := time.Parse(time.RFC3339, date)
			if err == nil {
				re.time = t
			}
		}

		log.Println(re.url)
	}

	return result
}

func (this NvidiaParser) Title() string {
	return "NVidia Research"
}

func (this NvidiaParser) Description() string {
	return "NVidia Research papers"
}

func (this NvidiaParser) RootUrl() string {
	return "https://research.nvidia.com"
}

func (this NvidiaParser) ServerUrl() string {
	return "/nvidia"
}

func (this NvidiaParser) CacheName() string {
	return "NvidiaParser"
}

func (this NvidiaParser) CacheTimeout() int {
	return 7200 // two hours, in seconds
}

@@ -1,67 +0,0 @@
from pyquery import PyQuery as pq
from feedgen.feed import FeedGenerator
from collections import namedtuple
import multiprocessing
from datetime import datetime
import requests

Entry = namedtuple('Entry', 'url fe')


class NvidiaParser:
    NAME = 'NVidia Research'
    URL = '/nvidia'
    CACHE_TIMEOUT = 3600
    root_url = 'https://research.nvidia.com'
    favicon = None

    def loadFavicon(self):
        try:
            favUrl = NvidiaParser.root_url + '/themes/custom/nvidia/favicon.ico'
            self.favicon = requests.get(favUrl)
        except:
            pass

    # def __init__(self):
    #     self.loadFavicon()

    def parseNvidiaDate(entry):
        dom = pq(entry.url)
        print(entry.url)

        time = dom('.field--name-field-publication-date').find('time').attr.datetime
        time = datetime.strptime(time, '%Y-%m-%dT%H:%M:%S%z')
        entry.fe.pubDate(time)

    def getRss(self):
        d = pq(self.root_url + '/publications')
        # self.loadFavicon()

        fg = FeedGenerator()
        fg.id(self.root_url)
        fg.title('NVidia Research')
        fg.link(href=self.root_url, rel='alternate')
        fg.description('NVidia Research papers')

        entries = []

        for elem in d('.views-field-title').items():
            link = elem.find('a')
            url = self.root_url + link.attr.href
            title = link.text()

            fe = fg.add_entry()
            fe.id(url)
            fe.title(title)
            fe.link(href=url)

            entries.append(Entry(url, fe))

        for entry in entries:
            NvidiaParser.parseNvidiaDate(entry)
            print(entry.url)

        # with multiprocessing.Pool(8) as p:
        #     p.map(NvidiaParser.parseNvidiaDate, entries)

        return fg.rss_str()