Browse Source

Changes to searchlib to make it more aggressive.

master
Trey Del Bonis 2 years ago
parent
commit
5e3b4321c4
1 changed file with 6 additions and 3 deletions
  1. searchlib.py (6 additions, 3 deletions)

+ 6
- 3
searchlib.py View File

'.*twitter\.com', '.*twitter\.com',
'.*quora.*', '.*quora.*',
'.*\.ru', '.*\.ru',
'.*\.jp',
'.*xn--.*'
] ]


# silly google # silly google
excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS] excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS]
results = {} results = {}
for qdate, rents in rtbl.items(): for qdate, rents in rtbl.items():
results[qdate] = _process_day_result(rw_domains, excl_domains, qdate, rents)
results[qdate] = _process_day_results(rw_domains, excl_domains, qdate, rents)
return results return results


def process_day_results(query_date, rents): def process_day_results(query_date, rents):
# Check skip because of URL path file extension # Check skip because of URL path file extension
skip_cuz_pathext = False skip_cuz_pathext = False
for ext in EXCL_PATH_EXTS: for ext in EXCL_PATH_EXTS:
if upath.endswith(ext):
# Also look at the full url anyways.
if upath.endswith(ext) or rent['u'].endswith(ext):
skip_cuz_pathext = True skip_cuz_pathext = True
break break
if skip_cuz_pathext: if skip_cuz_pathext:
rent = {} rent = {}
try: try:
u = urllib.parse.urlparse(rurl) u = urllib.parse.urlparse(rurl)
if (u.path == '/' or u.path == '') and u.params == '':
if (u.path == '/' or u.path == '') and u.params == '' and u.query == '':
print('url is for website main page and has no params, probably not a news article:', rurl) print('url is for website main page and has no params, probably not a news article:', rurl)
return None return None



Loading…
Cancel
Save