caching, index page, threading

main
ColumbusUtrigas 2022-08-20 02:01:46 +04:00
parent 804356a139
commit 23f06f73a2
2 changed files with 133 additions and 26 deletions

app.py

@@ -1,12 +1,94 @@
-from flask import Flask, Response
-from parsers.nvidia import *
+from flask import Flask, Response, abort
+from parsers.nvidia import NvidiaParser
+import os, time, threading
+
+
+# Pairs a parser with its cached RSS payload, the time of the last
+# refresh, and a lock guarding both (per instance, not per class).
+class ParserData:
+    def __init__(self, parser):
+        self.parser = parser
+        self.rss = None
+        self.time = None
+        self.lock = threading.Lock()
+
 
 app = Flask(__name__)
+parsers = [ParserData(NvidiaParser())]
+
+
+def getCachePath(parser):
+    return './_cache/' + parser.__class__.__name__
+
+
+# Load the cached feed if one exists. Returns True when the worker must
+# fetch fresh data: either no cache file exists yet or the cache expired.
+def checkParserCache(parser):
+    path = getCachePath(parser.parser)
+    try:
+        with open(path, 'r') as f:
+            parser.rss = f.read()
+    except OSError:
+        os.makedirs(os.path.dirname(path), exist_ok=True)
+        open(path, 'w').close()
+        return True
+    filetime = os.path.getmtime(path)
+    parser.time = filetime
+    return (time.time() - filetime) > parser.parser.CACHE_TIMEOUT
+
+
+def updateParserAndCache(parser):
+    rss = parser.parser.getRss()
+    with parser.lock:
+        parser.rss = rss
+        parser.time = time.time()
+        with open(getCachePath(parser.parser), 'w') as f:
+            f.write(rss.decode('utf-8'))
+
+
+# Background worker: serve from cache while it is fresh, then refresh
+# the feed once per CACHE_TIMEOUT.
+def runParserWorker(parser):
+    if checkParserCache(parser):
+        updateParserAndCache(parser)
+    while True:
+        nextmark = parser.time + parser.parser.CACHE_TIMEOUT
+        sleepfor = nextmark - time.time()
+        if sleepfor > 0:
+            time.sleep(sleepfor)
+        updateParserAndCache(parser)
+
+
+def runParserWorkers():
+    for parser in parsers:
+        # Daemon threads, so workers do not block interpreter shutdown.
+        threading.Thread(target=runParserWorker, args=(parser,), daemon=True).start()
+
+
+indexPage = """
+<html>
+<head>
+<title>RSS index page</title>
+</head>
+<body>
+<h1>RSS index page</h1>
+<h3>Custom generators of RSS feeds from different blogs</h3>
+<br/>
+"""
+for parser in parsers:
+    indexPage += f'<a href="{parser.parser.URL}">{parser.parser.NAME}</a>'
+indexPage += '</body></html>'
+
+
+@app.route('/')
+def index():
+    return indexPage
 
-@app.route("/")
-def index():
-    return "<p>Hello, World!</p>"
-
-@app.route("/nvidia")
-def rss_query():
-    return Response(parseNvidia(), mimetype='text/xml')
+
+runParserWorkers()
+
+# One route per parser. The default argument pins the loop variable (a
+# plain closure would see only the last parser), and an explicit endpoint
+# gives each view a unique name, so Flask accepts registering the same
+# function name more than once.
+for parser in parsers:
+    def query(parser=parser):
+        with parser.lock:
+            if parser.rss is None:
+                abort(404)
+            return Response(parser.rss, mimetype='text/xml')
+    app.add_url_rule(parser.parser.URL, endpoint=parser.parser.URL, view_func=query)
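The worker and route code above assume only a small parser contract: NAME, URL, and CACHE_TIMEOUT class attributes plus a getRss() method returning the feed as bytes. A minimal sketch of a second parser against that implied contract (ExampleParser and its feed details are hypothetical placeholders, not part of this commit):

# Hypothetical parser conforming to the contract implied by app.py;
# the class name, URL, and feed metadata are placeholders.
from feedgen.feed import FeedGenerator

class ExampleParser:
    NAME = 'Example Blog'    # link text on the index page
    URL = '/example'         # Flask route registered in app.py
    CACHE_TIMEOUT = 3600     # seconds between background refreshes

    def getRss(self):
        fg = FeedGenerator()
        fg.id('https://example.com')
        fg.title(self.NAME)
        fg.link(href='https://example.com', rel='alternate')
        fg.description('Example feed')
        return fg.rss_str()  # bytes, as updateParserAndCache expects

# Registration would then be one line in app.py:
# parsers = [ParserData(NvidiaParser()), ParserData(ExampleParser())]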

parsers/nvidia.py

@@ -1,27 +1,52 @@
 from pyquery import PyQuery as pq
 from feedgen.feed import FeedGenerator
+from collections import namedtuple
+import multiprocessing
+from datetime import datetime
+
+# Keeps a publication URL together with its feed entry so the date fetched
+# from the article page can be attached to the right entry afterwards.
+Entry = namedtuple('Entry', 'url fe')
 
-def parseNvidia():
-    root_url = 'https://research.nvidia.com'
-    d = pq(root_url + '/publications')
-    fg = FeedGenerator()
-    fg.id(root_url)
-    fg.title('NVidia Research')
-    fg.link(href=root_url, rel='alternate')
-    fg.logo(root_url + '/favicon.ico')
-    fg.description('NVidia Research papers')
-    for elem in d('.views-field-title').items():
-        link = elem.find('a')
-        url = root_url + link.attr.href
-        title = link.text()
-        fe = fg.add_entry()
-        fe.id(url)
-        fe.title(title)
-        fe.link(href=url)
-    return fg.rss_str()
+
+class NvidiaParser:
+    NAME = 'NVidia Research'
+    URL = '/nvidia'
+    CACHE_TIMEOUT = 3600  # seconds between background refreshes
+
+    # Fetch one publication page and parse its date. Runs in a worker
+    # process, so it returns the value instead of mutating the entry:
+    # changes to the pickled copy would never reach the parent process.
+    @staticmethod
+    def parseNvidiaDate(url):
+        print(url)  # progress logging
+        dom = pq(url)
+        stamp = dom('.field--name-field-publication-date').find('time').attr.datetime
+        return datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S%z')
+
+    def getRss(self):
+        root_url = 'https://research.nvidia.com'
+        d = pq(root_url + '/publications')
+
+        fg = FeedGenerator()
+        fg.id(root_url)
+        fg.title('NVidia Research')
+        fg.link(href=root_url, rel='alternate')
+        fg.logo(root_url + '/themes/custom/nvidia/favicon.ico')
+        fg.description('NVidia Research papers')
+
+        entries = []
+        for elem in d('.views-field-title').items():
+            link = elem.find('a')
+            url = root_url + link.attr.href
+            title = link.text()
+
+            fe = fg.add_entry()
+            fe.id(url)
+            fe.title(title)
+            fe.link(href=url)
+            entries.append(Entry(url, fe))
+
+        # Publication dates live on the individual article pages; fetch them
+        # in parallel, then attach the results back in this process.
+        with multiprocessing.Pool(8) as p:
+            dates = p.map(NvidiaParser.parseNvidiaDate, [e.url for e in entries])
+        for entry, date in zip(entries, dates):
+            entry.fe.pubDate(date)
+
+        return fg.rss_str()
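A quick way to smoke-test the parser outside Flask and the cache layer (a hypothetical check_feed.py, not part of this commit; it assumes pyquery and feedgen are installed):

# check_feed.py -- hypothetical smoke test, not part of this commit.
# Fetches the NVidia feed once, bypassing Flask and the cache layer.
from parsers.nvidia import NvidiaParser

# The __main__ guard matters here: getRss() uses multiprocessing, whose
# workers re-import this module on spawn-based platforms.
if __name__ == '__main__':
    rss = NvidiaParser().getRss()
    print(rss.decode('utf-8'))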