2 years ago · 5e3b4321c4
--- a/searchlib.py
+++ b/searchlib.py
    '.*twitter\.com',
    '.*quora.*',
    '.*\.ru',
    '.*\.jp',
    '.*xn--.*'
 ]
 # silly google
    excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS]
    results = {}
    for qdate, rents in rtbl.items():
        results[qdate] = _process_day_result(rw_domains, excl_domains, qdate, rents)
        results[qdate] = _process_day_results(rw_domains, excl_domains, qdate, rents)
    return results
 def process_day_results(query_date, rents):
        # Check skip because of URL path file extension
        skip_cuz_pathext = False
        for ext in EXCL_PATH_EXTS:
            if upath.endswith(ext):
            # Also check the full URL, not just the parsed path.
            if upath.endswith(ext) or rent['u'].endswith(ext):
                skip_cuz_pathext = True
                break
        if skip_cuz_pathext:
    rent = {}
    try:
        u = urllib.parse.urlparse(rurl)
        if (u.path == '/' or u.path == '') and u.params == '':
        if (u.path == '/' or u.path == '') and u.params == '' and u.query == '':
            print('url is for website main page and has no params, probably not a news article:', rurl)
            return None