|
|
@@ -157,47 +157,77 @@ def query_range(startdate, numdays, preloadurls=None): |
|
|
|
for i in range(numdays): |
|
|
|
d = startdate + (oneday * i) |
|
|
|
print(d) |
|
|
|
qresults = query_for_date(d, cookiejar) |
|
|
|
dres = _query_day_and_fetch(d, cookiejar, seenurls) |
|
|
|
dateurls[d.strftime('%Y-%m-%d')] = dres |
|
|
|
|
|
|
|
dayresults = [] |
|
|
|
for rurl, rtitle in qresults: |
|
|
|
if rurl in seenurls: |
|
|
|
continue |
|
|
|
seenurls.add(rurl) |
|
|
|
return dateurls |
|
|
|
|
|
|
|
def query_day(date, preloadurls=None):
    """Query search results for a single day and fetch article metadata.

    Parameters:
        date: the day to query (passed through to _query_day_and_fetch /
            query_for_date).
        preloadurls: optional iterable of URLs already seen; results whose
            URL appears in it are skipped.

    Returns:
        List of result dicts ('u' url, 't' result title, plus whatever
        newspaper-derived fields _fetch_article extracted).
    """
    # NOTE(review): the lines that previously followed `return res` were
    # unreachable merge/diff residue (they referenced undefined names such
    # as `rurl` and contained a bare `continue` outside any loop); they
    # have been removed.
    cookiejar = load_cookiejar()

    seenurls = set() if preloadurls is None else set(preloadurls)

    res = _query_day_and_fetch(date, cookiejar, seenurls)
    return res
|
|
|
def _query_day_and_fetch(date, cookiejar, seenurls_mut):
    """Run the search query for `date` and fetch metadata for each new URL.

    Parameters:
        date: the day to query (forwarded to query_for_date).
        cookiejar: cookie jar used for the query and article fetches.
        seenurls_mut: mutable set of already-seen URLs; updated in place so
            callers iterating several days deduplicate across days.

    Returns:
        List of dicts, one per previously-unseen result URL, each with at
        least 'u' (url) and 't' (result title), merged with whatever
        _fetch_article could extract.
    """
    # Fixed from the merge-mangled version: use the `date` parameter (was
    # undefined `d`), test membership against `seenurls_mut` (was undefined
    # `seenurls`), append each result exactly once (the residue appended
    # twice), and delegate all article fetching to _fetch_article instead
    # of the duplicated inline newspaper code.
    qresults = query_for_date(date, cookiejar)

    dayresults = []
    for rurl, rtitle in qresults:
        if rurl in seenurls_mut:
            continue
        seenurls_mut.add(rurl)

        rent = {
            'u': rurl,
            't': rtitle,
        }

        fa = _fetch_article(rurl, cookiejar)
        if fa is not None:
            rent.update(fa)

        dayresults.append(rent)

    return dayresults
|
|
|
def fetch_article(url):
    """Fetch a single article URL and return its metadata dict, or None.

    Returns None when the URL looks like a site front page, or when the
    article could not be downloaded/parsed.  On success the dict contains
    'u' (url), 't' (title, copied from the extracted 'nt'), and the other
    fields _fetch_article produced ('nd', 'nkw' when available).
    """
    cj = load_cookiejar()

    fa = _fetch_article(url, cj)
    if fa is None:
        return None

    # _fetch_article swallows download/parse errors and returns a partial
    # (possibly empty) dict; without 'nt' the line below used to raise
    # KeyError.  Treat a missing title as "no article".
    if 'nt' not in fa:
        return None

    fa['u'] = url
    fa['t'] = fa['nt']

    return fa
|
|
|
|
|
|
|
def _fetch_article(rurl, cookiejar): |
|
|
|
rent = {} |
|
|
|
try: |
|
|
|
u = urllib.parse.urlparse(rurl) |
|
|
|
if (u.path == '/' or u.path == '') and u.params == '': |
|
|
|
print('url is for website main page and has no params, probably not a news article:', rurl) |
|
|
|
return None |
|
|
|
|
|
|
|
print('processing', rurl) |
|
|
|
a = newspaper.Article(rurl) |
|
|
|
a.download() |
|
|
|
a.parse() |
|
|
|
a.nlp() |
|
|
|
|
|
|
|
rent['nt'] = a.title |
|
|
|
try: |
|
|
|
rent['nd'] = a.publish_date.strftime('%Y-%m-%d') |
|
|
|
except: |
|
|
|
pass |
|
|
|
rent['nkw'] = a.keywords |
|
|
|
|
|
|
|
except Exception as e: |
|
|
|
print(str(e)) |
|
|
|
|
|
|
|
return rent |
|
|
|
|
|
|
|
_query_headers = { |
|
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' |