import os
import datetime
import urllib.parse
import re
import http.cookiejar as cookielib

import newspaper
import nltk
from bs4 import BeautifulSoup
from requests import get

from sbenv import *

EXCL_DOMAINS = [
    r'(www\.)?researchgate\.net',
    r'(www\.)?businessyab.com',
    r'.*twitter\.com',
    r'.*quora.*',
]

# silly google
REWRITE_DOMAINS = [
    (r'.*\.facebook\.com', 'www.facebook.com'),
]

# silly google
EXCL_PATH_EXTS = [
    '.pdf',
    '.docx',
]

# If a page has any of these keywords then we drop it.
EXCL_KWS = [
    # for some reason it shows personal injury lawyers
    'injury',
    'attorney',
    'court',
    'claim',
    # repair shops
    'repair',
]

# If it has any of these then it's a hard confirm
PASS_KWS = [
    'storrow',
    'overpass',
]


def process_results(rtbl):
    rw_domains = [(re.compile(reg), rw) for reg, rw in REWRITE_DOMAINS]
    excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS]

    results = {}
    for qdate, rents in rtbl.items():
        for rent in rents:
            uscheme, unl, upath, uq, ufrag = urllib.parse.urlsplit(rent['u'])

            # Check skip because of domain exclusion
            skip_cuz_url = False
            for reg in excl_domains:
                if reg.fullmatch(unl):
                    skip_cuz_url = True
                    break
            if skip_cuz_url:
                continue

            # Rewrite the domain, if applicable
            for reg, rw in rw_domains:
                if reg.fullmatch(unl):
                    unl = rw
                    break

            # Check skip because of URL path file extension
            skip_cuz_pathext = False
            for ext in EXCL_PATH_EXTS:
                if upath.endswith(ext):
                    skip_cuz_pathext = True
                    break
            if skip_cuz_pathext:
                continue

            has_kws = 'nkw' in rent

            # Check skip because of hard exclusion keywords
            skip_cuz_kws = False
            if has_kws:
                for kw in EXCL_KWS:
                    for akw in rent['nkw']:
                        if kw in akw:
                            skip_cuz_kws = True
                    if skip_cuz_kws:
                        break
            if skip_cuz_kws:
                continue

            # Now characterize what kind of entry it is.
            has_date = 'nd' in rent
            has_title = 'nt' in rent
            has_pass_kw = False
            if has_kws:
                for kw in PASS_KWS:
                    for akw in rent['nkw']:
                        if kw in akw:
                            has_pass_kw = True
                    if has_pass_kw:
                        break

            # Try to assemble a record to store the thing.
            eff_date = rent['nd'] if has_date else qdate
            # Rebuild the URL with the (possibly rewritten) domain.
            eff_url = urllib.parse.urlunsplit((uscheme, unl, upath, uq, ufrag))
            item = {
                'slug': gen_slug(rent['t'], eff_date),
                'url': eff_url,
                'gtitle': rent['t'],
                'title': rent['nt'] if has_title else None,
                'date': eff_date,
                'kws': rent['nkw'] if has_kws else None,
            }

            if eff_date not in results:
                results[eff_date] = {
                    'pass': [],
                    'maybe': [],
                }

            if has_pass_kw:
                results[eff_date]['pass'].append(item)
            else:
                results[eff_date]['maybe'].append(item)

    return results


SLUG_CHUNKS = re.compile('[abcdefghijklmnopqrstuvwxyz1234567890]+')


def gen_slug(title, date):
    norm_title = title.lower()
    title_part = '-'.join(m.group(0) for m in SLUG_CHUNKS.finditer(norm_title))
    return '%s-%s' % (date, title_part)


def query_range(startdate, numdays, preloadurls=None):
    cookiejar = load_cookiejar()
    oneday = datetime.timedelta(days=1)
    seenurls = set() if preloadurls is None else set(preloadurls)
    dateurls = {}
    for i in range(numdays):
        d = startdate + (oneday * i)
        print(d)
        qresults = query_for_date(d, cookiejar)
        dayresults = []
        for rurl, rtitle in qresults:
            if rurl in seenurls:
                continue
            seenurls.add(rurl)
            rent = {
                'u': rurl,
                't': rtitle,
            }
            # Try to pull the full article so we can record its title, publish
            # date, and keywords; fall back to just the search result on error.
            try:
                print('processing', rurl)
                a = newspaper.Article(rurl)
                a.download()
                a.parse()
                a.nlp()
                rent['nt'] = a.title
                try:
                    rent['nd'] = a.publish_date.strftime('%Y-%m-%d')
                except Exception:
                    # publish_date may be missing (None).
                    pass
                rent['nkw'] = a.keywords
            except Exception as e:
                print(str(e))
            dayresults.append(rent)
        dateurls[d.strftime('%Y-%m-%d')] = dayresults
    return dateurls


_query_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}


def query_for_date(ondate, cookiejar):
    u = make_search_url(ondate)
    resp = get(u, headers=_query_headers, cookies=cookiejar)
    resp.raise_for_status()
    return list(parse_results(resp.text))


def make_search_url(ondate):
    params = {
        'q': '"storrow drive" truck accident bridge OR overpass',
        'hl': 'en',
        'tbs': create_range_param(ondate),
        'google_abuse': get_google_abuse_token(),
    }
    query_part = urllib.parse.urlencode(params)
    return 'https://www.google.com/search?' + query_part


def create_range_param(ondate):
    # Google's cdr param takes m/d/yyyy dates without leading zeros.
    datestr = ondate.strftime('%m/%d/%Y').lstrip('0').replace('/0', '/')
    return 'cdr:1,cd_min:%s,cd_max:%s' % (datestr, datestr)


def parse_results(raw_html):
    soup = BeautifulSoup(raw_html, 'html.parser')
    result_block = soup.find_all('div', attrs={'class': 'g'})
    for result in result_block:
        link = result.find('a', href=True)
        title = result.find('h3')
        if link and title:
            yield (link['href'], title.text)


def prep_nltk():
    nltk.download('punkt')
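

# Example driver: a minimal sketch, not part of the original script. It assumes
# load_cookiejar() and get_google_abuse_token() come from the `sbenv` wildcard
# import above, and that Google's result markup still uses 'div.g' containers.
# The date range below is purely illustrative.
if __name__ == '__main__':
    prep_nltk()
    start = datetime.date(2021, 1, 1)
    raw = query_range(start, 7)
    filtered = process_results(raw)
    for day, buckets in sorted(filtered.items()):
        print(day, len(buckets['pass']), 'pass,', len(buckets['maybe']), 'maybe')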