from pyquery import PyQuery as pq
from feedgen.feed import FeedGenerator
from collections import namedtuple
import multiprocessing
from multiprocessing.pool import ThreadPool
from datetime import datetime

# Pairs a publication page URL with the feed entry that describes it.
Entry = namedtuple('Entry', 'url fe')


class NvidiaParser:
    """Build an RSS feed from the NVidia Research publications listing."""

    # NOTE(review): NAME, URL and CACHE_TIMEOUT are not read anywhere in this
    # file; presumably they are consumed by the code that registers parsers
    # (CACHE_TIMEOUT looks like seconds) -- confirm against the caller.
    NAME = 'NVidia Research'
    URL = '/nvidia'
    CACHE_TIMEOUT = 3600

    @staticmethod
    def parseNvidiaDate(entry):
        """Fetch a publication page and set the entry's publication date.

        Args:
            entry: An ``Entry`` whose ``url`` is the publication page to
                fetch and whose ``fe`` is the feed entry to update in place.

        The date is read from the ``datetime`` attribute of the ``<time>``
        element inside ``.field--name-field-publication-date``. If the page
        has no such attribute the entry is left without a publication date.
        """
        dom = pq(entry.url)
        print(entry.url)
        raw = dom('.field--name-field-publication-date').find('time').attr.datetime
        if not raw:
            # No date on the page: leave the entry undated rather than
            # failing the whole feed on strptime(None, ...).
            return
        published = datetime.strptime(raw, '%Y-%m-%dT%H:%M:%S%z')
        entry.fe.pubDate(published)

    def getRss(self):
        """Scrape the publications listing and return it as an RSS string.

        Returns:
            The value of ``FeedGenerator.rss_str()`` for a feed holding one
            entry per linked title found on the publications page.
        """
        root_url = 'https://research.nvidia.com'
        d = pq(root_url + '/publications')

        fg = FeedGenerator()
        fg.id(root_url)
        fg.title('NVidia Research')
        fg.link(href=root_url, rel='alternate')
        fg.logo(root_url + '/themes/custom/nvidia/favicon.ico')
        fg.description('NVidia Research papers')

        entries = []
        print('RSS GOT')
        for elem in d('.views-field-title').items():
            link = elem.find('a')
            href = link.attr.href
            if not href:
                # Title cell without a link: nothing to point an entry at.
                continue
            url = root_url + href
            title = link.text()

            fe = fg.add_entry()
            fe.id(url)
            fe.title(title)
            fe.link(href=url)
            entries.append(Entry(url, fe))

        # Threads, not processes: a process pool pickles each Entry, so the
        # pubDate set by the worker would land on a copy and never reach the
        # feed entries owned by ``fg``. The work is network-bound, so threads
        # still overlap the page fetches.
        with ThreadPool(8) as p:
            p.map(NvidiaParser.parseNvidiaDate, entries)

        return fg.rss_str()