# Reconstructed post-image of the mangled patch (the original diff's hunk
# structure and the HTML literals were destroyed in transit; this is the
# cleaned, corrected content of both files it introduced).

# ==== parsers/nvidia.py ====
from pyquery import PyQuery as pq
from feedgen.feed import FeedGenerator
from collections import namedtuple
from multiprocessing.pool import ThreadPool
from datetime import datetime

# Pairs a publication URL with the feed entry whose pubDate still needs filling in.
Entry = namedtuple('Entry', 'url fe')


class NvidiaParser:
    """Scrapes https://research.nvidia.com/publications into an RSS feed."""

    NAME = 'NVidia Research'
    URL = '/nvidia'
    CACHE_TIMEOUT = 3600  # seconds between feed refreshes

    @staticmethod  # BUG FIX: was a method without self, only callable as an unbound function
    def parseNvidiaDate(entry):
        """Fetch one publication page and set the entry's pubDate from it."""
        dom = pq(entry.url)
        print(entry.url)

        time = dom('.field--name-field-publication-date').find('time').attr.datetime
        time = datetime.strptime(time, '%Y-%m-%dT%H:%M:%S%z')
        entry.fe.pubDate(time)

    def getRss(self):
        """Build and return the RSS document (bytes) for the publications index."""
        root_url = 'https://research.nvidia.com'
        d = pq(root_url + '/publications')

        fg = FeedGenerator()
        fg.id(root_url)
        fg.title('NVidia Research')
        fg.link(href=root_url, rel='alternate')
        fg.logo(root_url + '/themes/custom/nvidia/favicon.ico')
        fg.description('NVidia Research papers')

        entries = []
        print('RSS GOT')

        for elem in d('.views-field-title').items():
            link = elem.find('a')
            url = root_url + link.attr.href
            title = link.text()

            fe = fg.add_entry()
            fe.id(url)
            fe.title(title)
            fe.link(href=url)

            entries.append(Entry(url, fe))

        # BUG FIX: the original used multiprocessing.Pool here, but pubDate()
        # mutations made in child *processes* never propagate back to this
        # process's FeedGenerator, so every pubDate was silently lost. The work
        # is network-bound, so a thread pool (shared objects) is correct.
        with ThreadPool(8) as p:
            p.map(NvidiaParser.parseNvidiaDate, entries)

        return fg.rss_str()


# ==== app.py ====
from flask import Flask, Response, abort
from parsers.nvidia import NvidiaParser
import os
import time
import threading


class ParserData:
    """Mutable holder for one parser plus its cached RSS payload."""

    def __init__(self, parser):
        self.parser = parser  # the scraper instance (e.g. NvidiaParser)
        self.rss = None       # last generated RSS payload; None until first fetch
        self.time = None      # unix timestamp of the cached payload
        # BUG FIX: the lock was a class attribute, i.e. one lock shared by
        # every ParserData; each holder gets its own lock now.
        self.lock = threading.Lock()


app = Flask(__name__)

parsers = [ParserData(NvidiaParser())]


def getCachePath(parser):
    """Path of the on-disk cache file for a parser (keyed by class name)."""
    return './_cache/' + parser.__class__.__name__


def checkParserCache(parser):
    """Load the on-disk cache into parser.rss if present.

    Returns True when the cache is missing or stale and must be refreshed.
    """
    path = getCachePath(parser.parser)

    try:
        with open(path, 'r') as f:
            parser.rss = f.read()
    # BUG FIX: was a bare `except:` (swallowed KeyboardInterrupt/SystemExit too);
    # only filesystem errors mean "no usable cache".
    except OSError:
        os.makedirs(os.path.dirname(path), exist_ok=True)
        open(path, 'w').close()  # placeholder so getmtime works next time
        return True

    filetime = os.path.getmtime(path)
    parser.time = filetime
    return (time.time() - filetime) > parser.parser.CACHE_TIMEOUT


def updateParserAndCache(parser):
    """Regenerate the RSS, publish it under the lock, and persist it to disk."""
    rss = parser.parser.getRss()

    with parser.lock:
        parser.rss = rss
        parser.time = time.time()
        # Write inside the lock so readers never see rss/disk out of sync.
        with open(getCachePath(parser.parser), 'w') as f:
            f.write(parser.rss.decode('utf-8'))


def runParserWorker(parser):
    """Background loop: refresh one parser every CACHE_TIMEOUT seconds."""
    if checkParserCache(parser):
        updateParserAndCache(parser)

    while True:
        sleepfor = parser.time + parser.parser.CACHE_TIMEOUT - time.time()
        if sleepfor > 0:
            # BUG FIX: original recomputed `nextmark - time.time()` here,
            # which can have gone negative between the check and the sleep.
            time.sleep(sleepfor)
        updateParserAndCache(parser)


def runParserWorkers():
    """Start one refresh thread per registered parser."""
    for parser in parsers:
        # daemon=True so the refresh loops don't keep the process alive on exit
        threading.Thread(target=runParserWorker, args=(parser,), daemon=True).start()


# NOTE(review): the original HTML literals were stripped of their tags in
# transit; this markup is a reconstruction of the visible text — confirm
# against the real app.py before shipping.
indexPage = """<html>
<head><title>RSS index page</title></head>
<body>
<h1>RSS index page</h1>
<p>Custom generators of RSS feed from different blogs</p>
"""

for parser in parsers:
    indexPage += f'<a href="{parser.parser.URL}">{parser.parser.NAME}</a><br/>'
indexPage += '</body>'
indexPage += '</html>'


@app.route('/')
def index():
    return indexPage


runParserWorkers()


# BUG FIX: the original registered every route by defining `def query()` inside
# the loop — Flask raises on the duplicate 'query' endpoint with more than one
# parser, and the closure late-binds `parser` to the last list element. A
# factory binds each view to its own ParserData, and each rule gets a unique
# endpoint name.
def makeQueryView(data):
    def query():
        with data.lock:
            if data.rss is None:  # was `== None`
                abort(404)
            return Response(data.rss, mimetype='text/xml')
    return query


for parser in parsers:
    app.add_url_rule(parser.parser.URL,
                     endpoint='query_' + parser.parser.__class__.__name__,
                     view_func=makeQueryView(parser))