Browse Source

Changes to searchlib to make it more aggressive.

master
Trey Del Bonis 2 years ago
parent
commit
5e3b4321c4
1 changed file with 6 additions and 3 deletions
  1. searchlib.py (6 additions, 3 deletions)

+ 6
- 3
searchlib.py View File

@@ -17,6 +17,8 @@ EXCL_DOMAINS = [
'.*twitter\.com',
'.*quora.*',
'.*\.ru',
+ '.*\.jp',
+ '.*xn--.*'
]

# silly google
@@ -53,7 +55,7 @@ def process_results(rtbl):
excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS]
results = {}
for qdate, rents in rtbl.items():
- results[qdate] = _process_day_result(rw_domains, excl_domains, qdate, rents)
+ results[qdate] = _process_day_results(rw_domains, excl_domains, qdate, rents)
return results

def process_day_results(query_date, rents):
@@ -88,7 +90,8 @@ def _process_day_results(rw_domains, excl_domains, qdate, rents):
# Check skip because of URL path file extension
skip_cuz_pathext = False
for ext in EXCL_PATH_EXTS:
- if upath.endswith(ext):
+ # Also look at the full url anyways.
+ if upath.endswith(ext) or rent['u'].endswith(ext):
skip_cuz_pathext = True
break
if skip_cuz_pathext:
@@ -208,7 +211,7 @@ def _fetch_article(rurl, cookiejar):
rent = {}
try:
u = urllib.parse.urlparse(rurl)
- if (u.path == '/' or u.path == '') and u.params == '':
+ if (u.path == '/' or u.path == '') and u.params == '' and u.query == '':
print('url is for website main page and has no params, probably not a news article:', rurl)
return None


Loading…
Cancel
Save