Compare commits

..

No commits in common. "daca611f6afc9b7ae106f1bef2fea14fa2a44d1f" and "804356a1393acc8ad27c856829979a2cc4a9663a" have entirely different histories.

3 changed files with 25 additions and 138 deletions

View File

@@ -1,6 +0,0 @@
# RSS Parsers
## Required pip modules
- flask
- pyquery
- feedgen

96
app.py
View File

@@ -1,94 +1,12 @@
from flask import Flask, Response, abort from flask import Flask, Response
from parsers.nvidia import NvidiaParser from parsers.nvidia import *
import os, time, threading
class ParserData:
parser = None
rss = None
time = None
lock = threading.Lock()
def __init__(self, parser):
self.parser = parser
app = Flask(__name__) app = Flask(__name__)
parsers = [ParserData(NvidiaParser())] @app.route("/")
def getCachePath(parser):
path = './_cache/' + parser.__class__.__name__
return path
def checkParserCache(parser):
path = getCachePath(parser.parser)
try:
os.path.getmtime(path)
with open(path, 'r') as f:
parser.rss = f.read()
except:
os.makedirs(os.path.dirname(path), exist_ok=True)
open(path, 'w').close()
return True
filetime = os.path.getmtime(path)
currtime = time.time()
parser.time = filetime
return (currtime - filetime) > parser.parser.CACHE_TIMEOUT
def updateParserAndCache(parser):
rss = parser.parser.getRss()
with parser.lock:
parser.rss = rss
parser.time = time.time()
with open(getCachePath(parser.parser), 'w') as f:
f.write(parser.rss.decode('utf-8'))
def runParserWorker(parser):
if checkParserCache(parser):
updateParserAndCache(parser)
while True:
nextmark = parser.time + parser.parser.CACHE_TIMEOUT
sleepfor = nextmark - time.time()
if sleepfor > 0:
time.sleep(nextmark - time.time())
updateParserAndCache(parser)
def runParserWorkers():
for parser in parsers:
threading.Thread(target=runParserWorker, args=(parser,)).start()
indexPage = """
<html>
<head>
<title>RSS index page</title>
</head>
<body>
<h1>RSS index page</h1>
<h3>Custom generators of RSS feed from different blogs</h3>
<br/>
"""
for parser in parsers:
indexPage += f'<a href={parser.parser.URL}>{parser.parser.NAME}</a>'
indexPage += '</body>'
indexPage += '</html>'
@app.route('/')
def index(): def index():
return indexPage return "<p>Hello, World!</p>"
runParserWorkers()
for parser in parsers:
@app.route(parser.parser.URL)
def query():
with parser.lock:
if parser.rss == None:
abort(404)
return Response(parser.rss, mimetype='text/xml')
@app.route("/nvidia")
def rss_query():
return Response(parseNvidia(), mimetype='text/xml')

View File

@@ -1,52 +1,27 @@
from pyquery import PyQuery as pq from pyquery import PyQuery as pq
from feedgen.feed import FeedGenerator from feedgen.feed import FeedGenerator
from collections import namedtuple
import multiprocessing
from datetime import datetime
Entry = namedtuple('Entry', 'url fe') def parseNvidia():
root_url = 'https://research.nvidia.com'
d = pq(root_url +'/publications')
class NvidiaParser: fg = FeedGenerator()
NAME = 'NVidia Research' fg.id(root_url)
URL = '/nvidia' fg.title('NVidia Research')
CACHE_TIMEOUT = 3600 fg.link(href=root_url, rel='alternate')
fg.logo(root_url + '/favicon.ico')
fg.description('NVidia Research papers')
def parseNvidiaDate(entry): for elem in d('.views-field-title').items():
dom = pq(entry.url) link = elem.find('a')
print(entry.url) url = root_url + link.attr.href
title = link.text()
time = dom('.field--name-field-publication-date').find('time').attr.datetime fe = fg.add_entry()
time = datetime.strptime(time, '%Y-%m-%dT%H:%M:%S%z') fe.id(url)
entry.fe.pubDate(time) fe.title(title)
fe.link(href=url)
def getRss(self): return fg.rss_str()
root_url = 'https://research.nvidia.com'
d = pq(root_url +'/publications')
fg = FeedGenerator()
fg.id(root_url)
fg.title('NVidia Research')
fg.link(href=root_url, rel='alternate')
fg.logo(root_url + '/themes/custom/nvidia/favicon.ico')
fg.description('NVidia Research papers')
entries = []
print('RSS GOT')
for elem in d('.views-field-title').items():
link = elem.find('a')
url = root_url + link.attr.href
title = link.text()
fe = fg.add_entry()
fe.id(url)
fe.title(title)
fe.link(href=url)
entries.append(Entry(url, fe))
with multiprocessing.Pool(8) as p:
p.map(NvidiaParser.parseNvidiaDate, entries)
return fg.rss_str()