caching, index page, threading

main
ColumbusUtrigas 2022-08-20 02:01:46 +04:00
parent 804356a139
commit 23f06f73a2
2 changed files with 133 additions and 26 deletions

app.py

@@ -1,12 +1,94 @@
-from flask import Flask, Response
-from parsers.nvidia import *
+from flask import Flask, Response, abort
+from parsers.nvidia import NvidiaParser
+import os, time, threading
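+
+
+# per-parser state shared between the Flask handlers and the refresh thread:
+# the parser object, its cached RSS, the last refresh time, and a lock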
+class ParserData:
+    parser = None
+    rss = None
+    time = None
+
+    def __init__(self, parser):
+        self.parser = parser
+        # per-instance lock; a class-level Lock would be shared by every parser
+        self.lock = threading.Lock()
 
 
 app = Flask(__name__)
 
-
-@app.route("/")
-def index():
-    return "<p>Hello, World!</p>"
+
+parsers = [ParserData(NvidiaParser())]
+
+
+def getCachePath(parser):
+    # one cache file per parser class, e.g. ./_cache/NvidiaParser
+    path = './_cache/' + parser.__class__.__name__
+    return path
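+
+
+# returns True when the parser's feed needs to be (re)fetched; as a side
+# effect, loads a still-valid cached copy from disk into parser.rss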
+def checkParserCache(parser):
+    path = getCachePath(parser.parser)
+    try:
+        os.path.getmtime(path)
+        with open(path, 'r') as f:
+            parser.rss = f.read()
+    except OSError:
+        # no cache file yet: create an empty one and force a first fetch
+        os.makedirs(os.path.dirname(path), exist_ok=True)
+        open(path, 'w').close()
+        return True
+
+    filetime = os.path.getmtime(path)
+    currtime = time.time()
+    parser.time = filetime
+    return (currtime - filetime) > parser.parser.CACHE_TIMEOUT
+
+
+def updateParserAndCache(parser):
+    rss = parser.parser.getRss()
+    with parser.lock:
+        parser.rss = rss
+        parser.time = time.time()
+        with open(getCachePath(parser.parser), 'w') as f:
+            f.write(parser.rss.decode('utf-8'))
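+
+
+# one worker thread per parser: refresh once if the cache is stale, then
+# wake up every CACHE_TIMEOUT seconds to refresh again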
+def runParserWorker(parser):
+    if checkParserCache(parser):
+        updateParserAndCache(parser)
+
+    while True:
+        nextmark = parser.time + parser.parser.CACHE_TIMEOUT
+        sleepfor = nextmark - time.time()
+        if sleepfor > 0:
+            time.sleep(sleepfor)
+        updateParserAndCache(parser)
+
+
+def runParserWorkers():
+    for parser in parsers:
+        threading.Thread(target=runParserWorker, args=(parser,)).start()
+
+
+indexPage = """
+<html>
+<head>
+<title>RSS index page</title>
+</head>
+<body>
+<h1>RSS index page</h1>
+<h3>Custom RSS feed generators for different blogs</h3>
+<br/>
+"""
+for parser in parsers:
+    indexPage += f'<a href="{parser.parser.URL}">{parser.parser.NAME}</a>'
+indexPage += '</body>'
+indexPage += '</html>'
+
+
+@app.route('/')
+def index():
+    return indexPage
+
+
+runParserWorkers()
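+
+# register one route per parser; the endpoint must be unique, and the default
+# argument binds the current parser (a bare closure would capture only the
+# last loop value)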
+for parser in parsers:
+    @app.route(parser.parser.URL, endpoint=parser.parser.NAME)
+    def query(parser=parser):
+        with parser.lock:
+            if parser.rss is None:
+                abort(404)
+            return Response(parser.rss, mimetype='text/xml')
@app.route("/nvidia")
def rss_query():
return Response(parseNvidia(), mimetype='text/xml')

parsers/nvidia.py

@@ -1,27 +1,52 @@
 from pyquery import PyQuery as pq
 from feedgen.feed import FeedGenerator
+from collections import namedtuple
+import multiprocessing
 from datetime import datetime
 
-
-def parseNvidia():
-    root_url = 'https://research.nvidia.com'
-    d = pq(root_url + '/publications')
+Entry = namedtuple('Entry', 'url fe')
 
-    fg = FeedGenerator()
-    fg.id(root_url)
-    fg.title('NVidia Research')
-    fg.link(href=root_url, rel='alternate')
-    fg.logo(root_url + '/favicon.ico')
-    fg.description('NVidia Research papers')
+
+class NvidiaParser:
+    NAME = 'NVidia Research'
+    URL = '/nvidia'
+    CACHE_TIMEOUT = 3600
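+    # NAME/URL/CACHE_TIMEOUT are read by app.py: URL becomes the Flask route,
+    # and the cached feed is refreshed every CACHE_TIMEOUT seconds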
 
-    for elem in d('.views-field-title').items():
-        link = elem.find('a')
-        url = root_url + link.attr.href
-        title = link.text()
+    def parseNvidiaDate(entry):
+        # fetch one publication page and parse its publication date; the date
+        # is returned (not applied to entry.fe) because this runs in a Pool
+        # worker process, where mutating a pickled copy would be lost
+        dom = pq(entry.url)
+        print(entry.url)
 
-        fe = fg.add_entry()
-        fe.id(url)
-        fe.title(title)
-        fe.link(href=url)
-
+        time = dom('.field--name-field-publication-date').find('time').attr.datetime
+        time = datetime.strptime(time, '%Y-%m-%dT%H:%M:%S%z')
+        return time
-
-    return fg.rss_str()
+
+    def getRss(self):
+        root_url = 'https://research.nvidia.com'
+        d = pq(root_url + '/publications')
+
+        fg = FeedGenerator()
+        fg.id(root_url)
+        fg.title('NVidia Research')
+        fg.link(href=root_url, rel='alternate')
+        fg.logo(root_url + '/themes/custom/nvidia/favicon.ico')
+        fg.description('NVidia Research papers')
+
+        entries = []
+        print('RSS GOT')
+        for elem in d('.views-field-title').items():
+            link = elem.find('a')
+            url = root_url + link.attr.href
+            title = link.text()
+
+            fe = fg.add_entry()
+            fe.id(url)
+            fe.title(title)
+            fe.link(href=url)
+            entries.append(Entry(url, fe))
+
+        # fetch the per-entry dates in parallel, then apply them here so the
+        # pubDate lands on this process's FeedEntry objects
+        with multiprocessing.Pool(8) as p:
+            dates = p.map(NvidiaParser.parseNvidiaDate, entries)
+        for entry, date in zip(entries, dates):
+            entry.fe.pubDate(date)
+        return fg.rss_str()
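
A quick smoke test for the new routes, as a sketch rather than part of the commit: it assumes the app is running locally at Flask's default 127.0.0.1:5000, and that /nvidia returns 404 until the background worker finishes its first fetch.

import time
import urllib.request

BASE = 'http://127.0.0.1:5000'  # assumption: `flask run` defaults

# the index page lists one link per registered parser
with urllib.request.urlopen(BASE + '/') as resp:
    assert b'RSS index page' in resp.read()

# naive wait for the worker thread to populate the cache on a cold start
time.sleep(5)
with urllib.request.urlopen(BASE + '/nvidia') as resp:  # NvidiaParser.URL
    assert resp.headers.get_content_type() == 'text/xml'
    print(resp.read()[:80])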