|
|
@@ -17,6 +17,8 @@ EXCL_DOMAINS = [ |
|
|
|
'.*twitter\.com', |
|
|
|
'.*quora.*', |
|
|
|
'.*\.ru', |
|
|
|
'.*\.jp', |
|
|
|
'.*xn--.*' |
|
|
|
] |
|
|
|
|
|
|
|
# silly google |
|
|
@@ -53,7 +55,7 @@ def process_results(rtbl): |
|
|
|
excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS] |
|
|
|
results = {} |
|
|
|
for qdate, rents in rtbl.items(): |
|
|
|
results[qdate] = _process_day_result(rw_domains, excl_domains, qdate, rents) |
|
|
|
results[qdate] = _process_day_results(rw_domains, excl_domains, qdate, rents) |
|
|
|
return results |
|
|
|
|
|
|
|
def process_day_results(query_date, rents): |
|
|
@@ -88,7 +90,8 @@ def _process_day_results(rw_domains, excl_domains, qdate, rents): |
|
|
|
# Check skip because of URL path file extension |
|
|
|
skip_cuz_pathext = False |
|
|
|
for ext in EXCL_PATH_EXTS: |
|
|
|
if upath.endswith(ext): |
|
|
|
# Also look at the full url anyways. |
|
|
|
if upath.endswith(ext) or rent['u'].endswith(ext): |
|
|
|
skip_cuz_pathext = True |
|
|
|
break |
|
|
|
if skip_cuz_pathext: |
|
|
@@ -208,7 +211,7 @@ def _fetch_article(rurl, cookiejar): |
|
|
|
rent = {} |
|
|
|
try: |
|
|
|
u = urllib.parse.urlparse(rurl) |
|
|
|
if (u.path == '/' or u.path == '') and u.params == '': |
|
|
|
if (u.path == '/' or u.path == '') and u.params == '' and u.query == '': |
|
|
|
print('url is for website main page and has no params, probably not a news article:', rurl) |
|
|
|
return None |
|
|
|
|