2 anni fa · 5e3b4321c4
--- a/searchlib.py
+++ b/searchlib.py
@@ -17,6 +17,8 @@ EXCL_DOMAINS = [
    '.*twitter\.com',
    '.*quora.*',
    '.*\.ru',
    '.*\.jp',
    '.*xn--.*'
 ]

 # silly google
@@ -53,7 +55,7 @@ def process_results(rtbl):
    excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS]
    results = {}
    for qdate, rents in rtbl.items():
        results[qdate] = _process_day_result(rw_domains, excl_domains, qdate, rents)
        results[qdate] = _process_day_results(rw_domains, excl_domains, qdate, rents)
    return results

 def process_day_results(query_date, rents):
@@ -88,7 +90,8 @@ def _process_day_results(rw_domains, excl_domains, qdate, rents):
        # Check skip because of URL path file extension
        skip_cuz_pathext = False
        for ext in EXCL_PATH_EXTS:
            if upath.endswith(ext):
            # Also look at the full url anyways.
            if upath.endswith(ext) or rent['u'].endswith(ext):
                skip_cuz_pathext = True
                break
        if skip_cuz_pathext:
@@ -208,7 +211,7 @@ def _fetch_article(rurl, cookiejar):
    rent = {}
    try:
        u = urllib.parse.urlparse(rurl)
        if (u.path == '/' or u.path == '') and u.params == '':
        if (u.path == '/' or u.path == '') and u.params == '' and u.query == '':
            print('url is for website main page and has no params, probably not a news article:', rurl)
            return None