Rewritten in Go for performance

parent 9d474741a1
commit 366dd6375b

@@ -1,2 +1,2 @@
_cache/
*__pycache__/
rssparser

@@ -1,6 +1,2 @@
# RSS Parsers

## Required PiP modules
- flask
- pyquery
- feedgen

@@ -0,0 +1,166 @@
package main

import (
	"fmt"
	"log"
	"net/http"
	"os"
	"path/filepath"
	"sync"
	"time"

	"github.com/gorilla/feeds"
)

// RssEntry is a single item produced by a Parser.
type RssEntry struct {
	title       string
	url         string
	description string
	time        time.Time
}

// Parser describes one site-specific scraper that can be turned into an RSS feed.
type Parser interface {
	Parse() []RssEntry

	Title() string
	Description() string
	RootUrl() string
	ServerUrl() string
	CacheName() string
	CacheTimeout() int // seconds between refreshes
}

// ParserFeed pairs a Parser with its cached RSS output.
type ParserFeed struct {
	parser Parser
	rss    string
	mutex  *sync.Mutex
	time   time.Time
}

func (this ParserFeed) CachePath() string {
	return "./_cache/" + this.parser.CacheName() + ".rss"
}

// ReadCache loads the on-disk cache if present and reports whether the feed
// is stale and needs to be regenerated.
func (this *ParserFeed) ReadCache() bool {
	path := this.CachePath()
	stat, err := os.Stat(path)
	if err != nil {
		return true
	}

	data, err := os.ReadFile(path)
	if err != nil {
		return true
	}

	this.SetFeed(string(data))
	this.time = stat.ModTime()

	return time.Since(this.time).Seconds() > float64(this.parser.CacheTimeout())
}

// ParseAndUpdateCache runs the parser, rebuilds the RSS document and stores it
// both in memory and in the on-disk cache.
func (this *ParserFeed) ParseAndUpdateCache() {
	feed := &feeds.Feed{
		Title:       this.parser.Title(),
		Link:        &feeds.Link{Href: this.parser.RootUrl()},
		Description: this.parser.Description(),
		Created:     time.Now(),
	}

	entries := this.parser.Parse()

	for _, re := range entries {
		feed.Items = append(feed.Items, &feeds.Item{
			Id:          re.url,
			Title:       re.title,
			Link:        &feeds.Link{Href: re.url},
			Description: re.title,
			Created:     time.Now(),
		})
	}

	rssFeed, err := feed.ToRss()
	if err != nil {
		log.Println(err)
		return
	}

	// Best-effort cache write; the in-memory copy is still updated below.
	dir := filepath.Dir(this.CachePath())
	os.MkdirAll(dir, os.ModePerm)
	os.WriteFile(this.CachePath(), []byte(rssFeed), os.ModePerm)

	this.mutex.Lock()
	this.rss = rssFeed
	this.time = time.Now()
	this.mutex.Unlock()
}

// RunWorker refreshes the feed once if the cache is stale, then keeps
// refreshing it every CacheTimeout seconds.
func (this *ParserFeed) RunWorker() {
	if this.ReadCache() {
		this.ParseAndUpdateCache()
	}

	for {
		nextmark := this.time.Add(time.Second * time.Duration(this.parser.CacheTimeout()))
		sleepfor := nextmark.Unix() - time.Now().Unix()
		log.Println(this.parser.CacheName(), "sleeping for", sleepfor, "seconds")

		if sleepfor > 0 {
			time.Sleep(time.Second * time.Duration(sleepfor))
		}
		this.ParseAndUpdateCache()
		log.Println(this.parser.CacheName(), "feed updated")
	}
}

func (this *ParserFeed) SetFeed(feed string) {
	this.mutex.Lock()
	this.rss = feed
	this.mutex.Unlock()
}

func (this *ParserFeed) GetFeed() string {
	this.mutex.Lock()
	rss := this.rss
	this.mutex.Unlock()

	return rss
}

func main() {
	parsers := []*ParserFeed{
		{parser: NvidiaParser{}, mutex: &sync.Mutex{}},
	}

	// One background worker per parser keeps its feed up to date.
	for _, p := range parsers {
		go p.RunWorker()
	}

	rootPage :=
		`<!DOCTYPE html> <html><head>
	<title>RSS index page</title>
	<meta charset="utf-8">
	</head>
	<body>
	<h1>RSS index page</h1>
	<h3>Custom generators of RSS feed from different blogs</h3>
	<br/>`

	for _, p := range parsers {
		rootPage += fmt.Sprintf("<a href=%v>%v</a>", p.parser.ServerUrl(), p.parser.Title())
	}

	rootPage += "</body></html>"

	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprint(w, rootPage)
	})

	for _, p := range parsers {
		p := p // capture the loop variable; before Go 1.22 the closures would otherwise all see the last parser
		http.HandleFunc(p.parser.ServerUrl(), func(w http.ResponseWriter, r *http.Request) {
			w.Header().Set("Content-Type", "text/xml")
			fmt.Fprint(w, p.GetFeed())
		})
	}

	log.Fatal(http.ListenAndServe(":8081", nil))
}
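
Each feed source plugs into the server through the Parser interface above: implement the metadata getters plus Parse(), then add an instance to the parsers slice in main(). As a rough sketch of how a second source could be wired up (ExampleParser, its URLs and the returned entry are hypothetical placeholders, not part of this commit):

package main

import "time"

// ExampleParser is a hypothetical second feed source, shown only to
// illustrate the Parser interface; all URLs and values are placeholders.
type ExampleParser struct{}

func (p ExampleParser) Parse() []RssEntry {
	// A real implementation would fetch p.RootUrl() here (e.g. with goquery)
	// and return one RssEntry per article found on the page.
	return []RssEntry{
		{title: "Example post", url: p.RootUrl() + "/posts/1", time: time.Now()},
	}
}

func (p ExampleParser) Title() string       { return "Example Blog" }
func (p ExampleParser) Description() string { return "Example blog posts" }
func (p ExampleParser) RootUrl() string     { return "https://blog.example.com" }
func (p ExampleParser) ServerUrl() string   { return "/example" }
func (p ExampleParser) CacheName() string   { return "ExampleParser" }
func (p ExampleParser) CacheTimeout() int   { return 3600 }

// Registered in main() next to the existing entry:
//   parsers := []*ParserFeed{
//       {parser: NvidiaParser{}, mutex: &sync.Mutex{}},
//       {parser: ExampleParser{}, mutex: &sync.Mutex{}},
//   }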

app.py
@@ -1,107 +0,0 @@
from flask import Flask, Response, abort, request
from parsers.nvidia import NvidiaParser
from urllib.parse import urlparse
import os, time, threading


class ParserData:
    parser = None
    rss = None
    time = None
    lock = threading.Lock()

    def __init__(self, parser):
        self.parser = parser


app = Flask(__name__)

parsers = [ParserData(NvidiaParser())]


def getCachePath(parser):
    path = './_cache/' + parser.__class__.__name__
    return path


def checkParserCache(parser):
    path = getCachePath(parser.parser)

    try:
        os.path.getmtime(path)
        with open(path, 'r') as f:
            parser.rss = f.read()
    except:
        os.makedirs(os.path.dirname(path), exist_ok=True)
        open(path, 'w').close()
        return True

    filetime = os.path.getmtime(path)
    currtime = time.time()
    parser.time = filetime

    return (currtime - filetime) > parser.parser.CACHE_TIMEOUT


def updateParserAndCache(parser):
    rss = parser.parser.getRss()

    with parser.lock:
        parser.rss = rss
        parser.time = time.time()

        with open(getCachePath(parser.parser), 'w') as f:
            f.write(parser.rss.decode('utf-8'))


def runParserWorker(parser):
    if checkParserCache(parser):
        updateParserAndCache(parser)

    while True:
        nextmark = parser.time + parser.parser.CACHE_TIMEOUT
        sleepfor = nextmark - time.time()
        if sleepfor > 0:
            time.sleep(nextmark - time.time())
        updateParserAndCache(parser)


def runParserWorkers():
    for parser in parsers:
        threading.Thread(target=runParserWorker, args=(parser,)).start()


indexPage = """
<html>
<head>
<title>RSS index page</title>
</head>
<body>
<h1>RSS index page</h1>
<h3>Custom generators of RSS feed from different blogs</h3>
<br/>
"""

for parser in parsers:
    indexPage += f'<a href={parser.parser.URL}>{parser.parser.NAME}</a>'
indexPage += '</body>'
indexPage += '</html>'


@app.route('/')
def index():
    return indexPage


#@app.route('/favicon.ico')
#def favicon():
#    referrer = request.referrer
#    if referrer != None:
#        u = urlparse(referrer)
#
#        for parser in parsers:
#            if parser.parser.URL == u.path:
#                favi = parser.parser.favicon
#                return Response(favi.content, mimetype=favi.headers['Content-Type'])
#
#    abort(404)


runParserWorkers()

for parser in parsers:
    @app.route(parser.parser.URL)
    def query():
        with parser.lock:
            if parser.rss == None:
                abort(404)
            return Response(parser.rss, mimetype='text/xml')

@@ -0,0 +1,10 @@
module rssparser

go 1.19

require (
	github.com/PuerkitoBio/goquery v1.8.0 // indirect
	github.com/andybalholm/cascadia v1.3.1 // indirect
	github.com/gorilla/feeds v1.1.1 // indirect
	golang.org/x/net v0.0.0-20210916014120-12bc252f5db8 // indirect
)

@@ -0,0 +1,13 @@
github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U=
github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI=
github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
github.com/gorilla/feeds v1.1.1 h1:HwKXxqzcRNg9to+BbvJog4+f3s/xzvtZXICcQGutYfY=
github.com/gorilla/feeds v1.1.1/go.mod h1:Nk0jZrvPFZX1OBe5NPiddPw7CfwF6Q9eqzaBbaightA=
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8 h1:/6y1LfuqNuQdHAm0jjtPtgRcxIxjVZgm5OTu8/QhZvk=
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=

@@ -0,0 +1,77 @@
package main

import (
	"log"
	"net/http"
	"time"

	"github.com/PuerkitoBio/goquery"
)

// NvidiaParser scrapes the NVIDIA Research publications page.
type NvidiaParser struct {
}

func (this NvidiaParser) Parse() []RssEntry {
	var result []RssEntry

	resp, err := http.Get(this.RootUrl() + "/publications")
	if err != nil {
		log.Println(err)
		return result
	}

	doc, err := goquery.NewDocumentFromResponse(resp)
	if err != nil {
		log.Println(err)
		return result
	}

	// parse article titles and links
	doc.Find(".views-field-title").Each(func(i int, s *goquery.Selection) {
		link := s.Find("a")
		href, exists := link.Attr("href")
		if exists {
			result = append(result, RssEntry{title: link.Text(), url: this.RootUrl() + href})
		}
	})

	// parse descriptions and publication dates from each article page;
	// iterate by index so the stored entries are updated, not copies
	for i := range result {
		re := &result[i]

		resp, err := http.Get(re.url)
		if err != nil {
			log.Println(err)
			continue
		}

		doc, err := goquery.NewDocumentFromResponse(resp)
		if err != nil {
			log.Println(err)
			continue
		}

		re.description = doc.Find(".field--type-text-with-summary").Text()

		date, exists := doc.Find(".field--name-field-publication-date").Find("time").Attr("datetime")
		if exists {
			t, err := time.Parse(time.RFC3339, date)
			if err == nil {
				re.time = t
			}
		}

		log.Println(re.url)
	}

	return result
}

func (this NvidiaParser) Title() string {
	return "NVidia Research"
}

func (this NvidiaParser) Description() string {
	return "NVidia Research papers"
}

func (this NvidiaParser) RootUrl() string {
	return "https://research.nvidia.com"
}

func (this NvidiaParser) ServerUrl() string {
	return "/nvidia"
}

func (this NvidiaParser) CacheName() string {
	return "NvidiaParser"
}

func (this NvidiaParser) CacheTimeout() int {
	return 7200 // two hours, in seconds
}

@@ -1,67 +0,0 @@
from pyquery import PyQuery as pq
from feedgen.feed import FeedGenerator
from collections import namedtuple
import multiprocessing
from datetime import datetime
import requests

Entry = namedtuple('Entry', 'url fe')


class NvidiaParser:
    NAME = 'NVidia Research'
    URL = '/nvidia'
    CACHE_TIMEOUT = 3600
    root_url = 'https://research.nvidia.com'
    favicon = None

    def loadFavicon(self):
        try:
            favUrl = NvidiaParser.root_url + '/themes/custom/nvidia/favicon.ico'
            self.favicon = requests.get(favUrl)
        except:
            pass

    # def __init__(self):
    #     self.loadFavicon()

    def parseNvidiaDate(entry):
        dom = pq(entry.url)
        print(entry.url)

        time = dom('.field--name-field-publication-date').find('time').attr.datetime
        time = datetime.strptime(time, '%Y-%m-%dT%H:%M:%S%z')
        entry.fe.pubDate(time)

    def getRss(self):
        d = pq(self.root_url + '/publications')
        # self.loadFavicon()

        fg = FeedGenerator()
        fg.id(self.root_url)
        fg.title('NVidia Research')
        fg.link(href=self.root_url, rel='alternate')
        fg.description('NVidia Research papers')

        entries = []

        for elem in d('.views-field-title').items():
            link = elem.find('a')
            url = self.root_url + link.attr.href
            title = link.text()

            fe = fg.add_entry()
            fe.id(url)
            fe.title(title)
            fe.link(href=url)

            entries.append(Entry(url, fe))

        for entry in entries:
            NvidiaParser.parseNvidiaDate(entry)
            print(entry.url)

        # with multiprocessing.Pool(8) as p:
        #     p.map(NvidiaParser.parseNvidiaDate, entries)

        return fg.rss_str()