rewritten in Go for performance

main
ColumbusUtrigas 2022-08-21 07:53:21 +04:00
parent 9d474741a1
commit 366dd6375b
8 changed files with 267 additions and 179 deletions

2 .gitignore vendored

@@ -1,2 +1,2 @@
 _cache/
-*__pycache__/
+rssparser

4 README.md

@@ -1,6 +1,2 @@
 # RSS Parsers
-## Required PiP modules
-- flask
-- pyquery
-- feedgen

166 app.go Normal file

@@ -0,0 +1,166 @@
package main

import (
	"fmt"
	"log"
	"net/http"
	"os"
	"path/filepath"
	"sync"
	"time"

	"github.com/gorilla/feeds"
)

// RssEntry is a single parsed article.
type RssEntry struct {
	title       string
	url         string
	description string
	time        time.Time
}
// Parser is implemented by each site-specific scraper.
type Parser interface {
	Parse() []RssEntry
	Title() string
	Description() string
	RootUrl() string
	ServerUrl() string
	CacheName() string
	CacheTimeout() int // seconds between refreshes
}

// ParserFeed pairs a Parser with its cached RSS output.
type ParserFeed struct {
	parser Parser
	rss    string
	mutex  *sync.Mutex
	time   time.Time
}
func (this ParserFeed) CachePath() string {
	return "./_cache/" + this.parser.CacheName() + ".rss"
}

// ReadCache loads the cached feed from disk if present and reports
// whether the cache is missing or older than the parser's timeout.
func (this *ParserFeed) ReadCache() bool {
	path := this.CachePath()
	stat, err := os.Stat(path)
	if err != nil {
		return true
	}
	data, err := os.ReadFile(path)
	if err != nil {
		return true
	}
	this.SetFeed(string(data))
	this.time = stat.ModTime()
	return time.Since(this.time).Seconds() > float64(this.parser.CacheTimeout())
}
// ParseAndUpdateCache scrapes the source, rebuilds the RSS document,
// writes it to the cache file, and swaps it in under the lock.
func (this *ParserFeed) ParseAndUpdateCache() {
	feed := &feeds.Feed{
		Title:       this.parser.Title(),
		Link:        &feeds.Link{Href: this.parser.RootUrl()},
		Description: this.parser.Description(),
		Created:     time.Now(),
	}
	entries := this.parser.Parse()
	for _, re := range entries {
		feed.Items = append(feed.Items, &feeds.Item{
			Id:          re.url,
			Title:       re.title,
			Link:        &feeds.Link{Href: re.url},
			Description: re.description,
			Created:     re.time,
		})
	}
	rssFeed, err := feed.ToRss()
	if err != nil {
		log.Println(err)
		return // keep the previous feed instead of caching an empty one
	}
	dir := filepath.Dir(this.CachePath())
	if err := os.MkdirAll(dir, os.ModePerm); err != nil {
		log.Println(err)
	}
	if err := os.WriteFile(this.CachePath(), []byte(rssFeed), os.ModePerm); err != nil {
		log.Println(err)
	}
	this.mutex.Lock()
	this.rss = rssFeed
	this.time = time.Now()
	this.mutex.Unlock()
}
// RunWorker keeps the feed fresh: it sleeps until the cache expires,
// then re-parses. One goroutine per parser.
func (this *ParserFeed) RunWorker() {
	if this.ReadCache() {
		this.ParseAndUpdateCache()
	}
	for {
		nextmark := this.time.Add(time.Second * time.Duration(this.parser.CacheTimeout()))
		sleepfor := nextmark.Unix() - time.Now().Unix()
		if sleepfor > 0 {
			time.Sleep(time.Second * time.Duration(sleepfor))
		}
		this.ParseAndUpdateCache()
		log.Printf("%s: feed updated", this.parser.CacheName())
	}
}
func (this *ParserFeed) SetFeed(feed string) {
	this.mutex.Lock()
	this.rss = feed
	this.mutex.Unlock()
}

func (this *ParserFeed) GetFeed() string {
	this.mutex.Lock()
	rss := this.rss
	this.mutex.Unlock()
	return rss
}
func main() {
	parsers := []*ParserFeed{
		{parser: NvidiaParser{}, mutex: &sync.Mutex{}},
	}
	for _, p := range parsers {
		go p.RunWorker()
	}

	rootPage := `<!DOCTYPE html> <html><head>
<title>RSS index page</title>
<meta charset="utf-8">
</head>
<body>
<h1>RSS index page</h1>
<h3>Custom generators of RSS feed from different blogs</h3>
<br/>`
	for _, p := range parsers {
		rootPage += fmt.Sprintf(`<a href="%v">%v</a>`, p.parser.ServerUrl(), p.parser.Title())
	}
	rootPage += "</body></html>"

	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		// Fprint, not Fprintf: the page is data, not a format string.
		fmt.Fprint(w, rootPage)
	})
	for _, p := range parsers {
		p := p // capture the loop variable; on Go 1.19 all handlers would otherwise serve the last parser's feed
		http.HandleFunc(p.parser.ServerUrl(), func(w http.ResponseWriter, r *http.Request) {
			w.Header().Set("Content-Type", "text/xml")
			fmt.Fprint(w, p.GetFeed())
		})
	}
	log.Fatal(http.ListenAndServe(":8081", nil))
}
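The Parser interface keeps the caching and HTTP plumbing generic: adding a feed source means implementing its seven methods and appending one entry to the parsers slice in main. A minimal sketch of a new source (ExampleParser, its URLs, and the stub entry are hypothetical, not part of this commit):

// Hypothetical additional parser, for illustration only.
type ExampleParser struct{}

func (p ExampleParser) Parse() []RssEntry {
	// A real parser would scrape the site here; this stub returns one fixed entry.
	return []RssEntry{{title: "Hello", url: "https://blog.example.com/hello", time: time.Now()}}
}
func (p ExampleParser) Title() string       { return "Example Blog" }
func (p ExampleParser) Description() string { return "Example feed" }
func (p ExampleParser) RootUrl() string     { return "https://blog.example.com" }
func (p ExampleParser) ServerUrl() string   { return "/example" }
func (p ExampleParser) CacheName() string   { return "ExampleParser" }
func (p ExampleParser) CacheTimeout() int   { return 3600 }

It would then be registered alongside the NVidia parser:

	parsers := []*ParserFeed{
		{parser: NvidiaParser{}, mutex: &sync.Mutex{}},
		{parser: ExampleParser{}, mutex: &sync.Mutex{}},
	}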

107 app.py

@@ -1,107 +0,0 @@
from flask import Flask, Response, abort, request
from parsers.nvidia import NvidiaParser
from urllib.parse import urlparse
import os, time, threading


class ParserData:
    parser = None
    rss = None
    time = None
    lock = threading.Lock()

    def __init__(self, parser):
        self.parser = parser


app = Flask(__name__)
parsers = [ParserData(NvidiaParser())]
def getCachePath(parser):
    path = './_cache/' + parser.__class__.__name__
    return path


def checkParserCache(parser):
    path = getCachePath(parser.parser)
    try:
        os.path.getmtime(path)
        with open(path, 'r') as f:
            parser.rss = f.read()
    except:
        os.makedirs(os.path.dirname(path), exist_ok=True)
        open(path, 'w').close()
        return True
    filetime = os.path.getmtime(path)
    currtime = time.time()
    parser.time = filetime
    return (currtime - filetime) > parser.parser.CACHE_TIMEOUT
def updateParserAndCache(parser):
    rss = parser.parser.getRss()
    with parser.lock:
        parser.rss = rss
        parser.time = time.time()
    with open(getCachePath(parser.parser), 'w') as f:
        f.write(parser.rss.decode('utf-8'))


def runParserWorker(parser):
    if checkParserCache(parser):
        updateParserAndCache(parser)
    while True:
        nextmark = parser.time + parser.parser.CACHE_TIMEOUT
        sleepfor = nextmark - time.time()
        if sleepfor > 0:
            time.sleep(nextmark - time.time())
        updateParserAndCache(parser)


def runParserWorkers():
    for parser in parsers:
        threading.Thread(target=runParserWorker, args=(parser,)).start()
indexPage = """
<html>
<head>
<title>RSS index page</title>
</head>
<body>
<h1>RSS index page</h1>
<h3>Custom generators of RSS feed from different blogs</h3>
<br/>
"""
for parser in parsers:
    indexPage += f'<a href={parser.parser.URL}>{parser.parser.NAME}</a>'
indexPage += '</body>'
indexPage += '</html>'
@app.route('/')
def index():
    return indexPage


# @app.route('/favicon.ico')
# def favicon():
#     referrer = request.referrer
#     if referrer != None:
#         u = urlparse(referrer)
#
#         for parser in parsers:
#             if parser.parser.URL == u.path:
#                 favi = parser.parser.favicon
#                 return Response(favi.content, mimetype=favi.headers['Content-Type'])
#
#     abort(404)


runParserWorkers()

for parser in parsers:
    @app.route(parser.parser.URL)
    def query():
        with parser.lock:
            if parser.rss == None:
                abort(404)
            return Response(parser.rss, mimetype='text/xml')

10 go.mod Normal file

@@ -0,0 +1,10 @@
module rssparser

go 1.19

require (
	github.com/PuerkitoBio/goquery v1.8.0 // indirect
	github.com/andybalholm/cascadia v1.3.1 // indirect
	github.com/gorilla/feeds v1.1.1 // indirect
	golang.org/x/net v0.0.0-20210916014120-12bc252f5db8 // indirect
)
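A side note: all four modules carry // indirect markers even though goquery and gorilla/feeds are imported directly by app.go and nvidia.go, which suggests the file was written by hand rather than by go mod tidy. Running go mod tidy would likely split the block roughly like this (a sketch, assuming no other imports):

require (
	github.com/PuerkitoBio/goquery v1.8.0
	github.com/gorilla/feeds v1.1.1
)

require (
	github.com/andybalholm/cascadia v1.3.1 // indirect
	golang.org/x/net v0.0.0-20210916014120-12bc252f5db8 // indirect
)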

13 go.sum Normal file

@@ -0,0 +1,13 @@
github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U=
github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI=
github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
github.com/gorilla/feeds v1.1.1 h1:HwKXxqzcRNg9to+BbvJog4+f3s/xzvtZXICcQGutYfY=
github.com/gorilla/feeds v1.1.1/go.mod h1:Nk0jZrvPFZX1OBe5NPiddPw7CfwF6Q9eqzaBbaightA=
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8 h1:/6y1LfuqNuQdHAm0jjtPtgRcxIxjVZgm5OTu8/QhZvk=
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=

77 nvidia.go Normal file

@@ -0,0 +1,77 @@
package main

import (
	"fmt"
	"log"
	"net/http"
	"time"

	"github.com/PuerkitoBio/goquery"
)

// NvidiaParser scrapes research.nvidia.com publications into RssEntry values.
type NvidiaParser struct {
}
func (this NvidiaParser) Parse() []RssEntry {
	var result []RssEntry
	resp, err := http.Get(this.RootUrl() + "/publications")
	if err != nil {
		log.Println(err)
		return result
	}
	doc, err := goquery.NewDocumentFromResponse(resp)
	if err != nil {
		log.Println(err)
		return result
	}
	// parse articles and links
	doc.Find(".views-field-title").Each(func(i int, s *goquery.Selection) {
		link := s.Find("a")
		href, exists := link.Attr("href")
		if exists {
			result = append(result, RssEntry{title: link.Text(), url: this.RootUrl() + href})
		}
	})
	// parse descriptions and times; index into the slice so the updates
	// are stored in result rather than lost on a copied loop variable
	for i := range result {
		re := &result[i]
		resp, err := http.Get(re.url)
		if err != nil {
			log.Println(err)
			continue
		}
		doc, err := goquery.NewDocumentFromResponse(resp)
		if err != nil {
			log.Println(err)
			continue
		}
		re.description = doc.Find(".field--type-text-with-summary").Text()
		date, exists := doc.Find(".field--name-field-publication-date").Find("time").Attr("datetime")
		if exists {
			t, err := time.Parse(time.RFC3339, date)
			if err == nil {
				re.time = t
			}
		}
		fmt.Println(re.url)
	}
	return result
}
func (this NvidiaParser) Title() string {
	return "NVidia Research"
}

func (this NvidiaParser) Description() string {
	return "NVidia Research papers"
}

func (this NvidiaParser) RootUrl() string {
	return "https://research.nvidia.com"
}

func (this NvidiaParser) ServerUrl() string {
	return "/nvidia"
}

func (this NvidiaParser) CacheName() string {
	return "NvidiaParser"
}

func (this NvidiaParser) CacheTimeout() int {
	return 7200 // seconds
}
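goquery.NewDocumentFromResponse, used above, is deprecated in goquery v1.8.0; it still works, but the maintained entry point is NewDocumentFromReader. A sketch of the equivalent fetch-and-parse step (the fetchDocument helper is hypothetical, not part of this commit; it assumes the same imports as nvidia.go):

// fetchDocument fetches url and parses the response body into a
// goquery document, closing the body when done.
func fetchDocument(url string) (*goquery.Document, error) {
	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("GET %s: status %d", url, resp.StatusCode)
	}
	return goquery.NewDocumentFromReader(resp.Body)
}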

67 parsers/nvidia.py

@@ -1,67 +0,0 @@
from pyquery import PyQuery as pq
from feedgen.feed import FeedGenerator
from collections import namedtuple
import multiprocessing
from datetime import datetime
import requests

Entry = namedtuple('Entry', 'url fe')


class NvidiaParser:
    NAME = 'NVidia Research'
    URL = '/nvidia'
    CACHE_TIMEOUT = 3600
    root_url = 'https://research.nvidia.com'
    favicon = None

    def loadFavicon(self):
        try:
            favUrl = NvidiaParser.root_url + '/themes/custom/nvidia/favicon.ico'
            self.favicon = requests.get(favUrl)
        except:
            pass

    # def __init__(self):
    #     self.loadFavicon()

    def parseNvidiaDate(entry):
        dom = pq(entry.url)
        print(entry.url)
        time = dom('.field--name-field-publication-date').find('time').attr.datetime
        time = datetime.strptime(time, '%Y-%m-%dT%H:%M:%S%z')
        entry.fe.pubDate(time)
    def getRss(self):
        d = pq(self.root_url + '/publications')
        # self.loadFavicon()
        fg = FeedGenerator()
        fg.id(self.root_url)
        fg.title('NVidia Research')
        fg.link(href=self.root_url, rel='alternate')
        fg.description('NVidia Research papers')

        entries = []
        for elem in d('.views-field-title').items():
            link = elem.find('a')
            url = self.root_url + link.attr.href
            title = link.text()
            fe = fg.add_entry()
            fe.id(url)
            fe.title(title)
            fe.link(href=url)
            entries.append(Entry(url, fe))

        for entry in entries:
            NvidiaParser.parseNvidiaDate(entry)
            print(entry.url)
        # with multiprocessing.Pool(8) as p:
        #     p.map(NvidiaParser.parseNvidiaDate, entries)

        return fg.rss_str()