import os
import datetime
import urllib.parse
import re
import http.cookiejar as cookielib

import newspaper
import nltk
from bs4 import BeautifulSoup
from requests import get

# Local environment helpers (expected to provide load_cookiejar() and
# get_google_abuse_token(), used below).
from sbenv import *

# Result domains to drop entirely.
EXCL_DOMAINS = [
    r'(www\.)?researchgate\.net',
    r'(www\.)?businessyab\.com',
    r'.*twitter\.com',
    r'.*quora.*',
    r'.*\.ru',
    r'.*\.jp',
    r'.*xn--.*',
]  # silly google

# Domains to normalize before storing results.
REWRITE_DOMAINS = [
    (r'.*\.facebook\.com', 'www.facebook.com'),
]  # silly google

# URL path extensions we don't want to fetch.
EXCL_PATH_EXTS = [
    '.pdf',
    '.docx',
]

# If a page has any of these keywords then we drop it.
EXCL_KWS = [
    # for some reason it shows personal injury lawyers
    'injury',
    'attorney',
    'court',
    'claim',
    # repair shops
    'repair',
]

# If it has any of these then it's a hard confirm.
PASS_KWS = [
    'storrow',
    'overpass',
]


def process_results(rtbl):
    """Triage a whole table of {query_date: [result entries]} at once."""
    rw_domains = [(re.compile(reg), rw) for reg, rw in REWRITE_DOMAINS]
    excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS]
    results = {}
    for qdate, rents in rtbl.items():
        results[qdate] = _process_day_results(rw_domains, excl_domains, qdate, rents)
    return results


def process_day_results(query_date, rents):
    """Triage the results for a single query date."""
    rw_domains = [(re.compile(reg), rw) for reg, rw in REWRITE_DOMAINS]
    excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS]
    return _process_day_results(rw_domains, excl_domains, query_date, rents)


def _process_day_results(rw_domains, excl_domains, qdate, rents):
    articles = {
        'pass': [],
        'maybe': [],
    }
    for rent in rents:
        uscheme, unl, upath, upar, uq, ufrag = urllib.parse.urlparse(rent['u'])

        # Check skip because of domain exclusion.
        skip_cuz_url = False
        for reg in excl_domains:
            if reg.fullmatch(unl):
                skip_cuz_url = True
                break
        if skip_cuz_url:
            continue

        # Rewrite the domain, if applicable.
        for reg, rw in rw_domains:
            if reg.fullmatch(unl):
                unl = rw
                break

        # Check skip because of URL path file extension.
        skip_cuz_pathext = False
        for ext in EXCL_PATH_EXTS:
            # Also look at the full url anyways.
            if upath.endswith(ext) or rent['u'].endswith(ext):
                skip_cuz_pathext = True
                break
        if skip_cuz_pathext:
            continue

        has_kws = 'nkw' in rent

        # Check skip because of hard exclusion keywords.
        skip_cuz_kws = False
        if has_kws:
            for kw in EXCL_KWS:
                for akw in rent['nkw']:
                    if kw in akw:
                        skip_cuz_kws = True
                if skip_cuz_kws:
                    break
        if skip_cuz_kws:
            continue

        # Now characterize what kind of entry it is.
        has_date = 'nd' in rent
        has_title = 'nt' in rent
        has_pass_kw = False
        if has_kws:
            for kw in PASS_KWS:
                for akw in rent['nkw']:
                    if kw in akw:
                        has_pass_kw = True
                if has_pass_kw:
                    break

        # Try to assemble a record to store the thing.
        eff_date = rent['nd'] if has_date else qdate
        # Rebuild the URL with the (possibly rewritten) domain and no fragment.
        eff_url = urllib.parse.urlunparse([uscheme, unl, upath, upar, uq, ''])
        item = {
            'slug': gen_slug(rent['t'], eff_date),
            'url': eff_url,
            'gtitle': rent['t'],
            'title': rent['nt'] if has_title else None,
            'date': eff_date,
            'kws': rent['nkw'] if has_kws else None,
        }

        if has_pass_kw:
            articles['pass'].append(item)
        else:
            articles['maybe'].append(item)

    return articles


# Runs of lowercase letters and digits; everything else becomes a slug separator.
SLUG_CHUNKS = re.compile('[abcdefghijklmnopqrstuvwxyz1234567890]+')


def gen_slug(title, date):
    norm_title = title.lower()
    title_part = '-'.join(m.group(0) for m in SLUG_CHUNKS.finditer(norm_title))
    return '%s-%s' % (date, title_part)


def query_range(startdate, numdays, preloadurls=None):
    """Query and fetch articles for each day in a range, skipping already-seen URLs."""
    cookiejar = load_cookiejar()
    oneday = datetime.timedelta(days=1)
    seenurls = set() if preloadurls is None else set(preloadurls)
    dateurls = {}
    for i in range(numdays):
        d = startdate + (oneday * i)
        print(d)
        dres = _query_day_and_fetch(d, cookiejar, seenurls)
        dateurls[d.strftime('%Y-%m-%d')] = dres
    return dateurls


def query_day(date, preloadurls=None):
    """Query and fetch articles for a single day."""
    cookiejar = load_cookiejar()
    seenurls = set() if preloadurls is None else set(preloadurls)
    res = _query_day_and_fetch(date, cookiejar, seenurls)
    return res


def _query_day_and_fetch(date, cookiejar, seenurls_mut):
    qresults = query_for_date(date, cookiejar)
    dayresults = []
    for rurl, rtitle in qresults:
        if rurl in seenurls_mut:
            continue
        seenurls_mut.add(rurl)
        rent = {
            'u': rurl,
            't': rtitle,
        }
        fa = _fetch_article(rurl, cookiejar)
        if fa is not None:
            rent.update(fa)
        dayresults.append(rent)
    return dayresults


def fetch_article(url):
    """Fetch a single article and normalize it to the same shape as query results."""
    cj = load_cookiejar()
    fa = _fetch_article(url, cj)
    if fa is None:
        return None
    fa['u'] = url
    if 'nt' in fa:
        fa['t'] = fa['nt']
    return fa


def _fetch_article(rurl, cookiejar):
    rent = {}
    try:
        u = urllib.parse.urlparse(rurl)
        if (u.path == '/' or u.path == '') and u.params == '' and u.query == '':
            print('url is for website main page and has no params, probably not a news article:', rurl)
            return None
        print('processing', rurl)
        a = newspaper.Article(rurl)
        a.download()
        a.parse()
        a.nlp()
        rent['nt'] = a.title
        try:
            # publish_date may be None, in which case strftime() raises.
            rent['nd'] = a.publish_date.strftime('%Y-%m-%d')
        except Exception:
            pass
        rent['nkw'] = a.keywords
    except Exception as e:
        # Return whatever fields we managed to extract before the failure.
        print(str(e))
    return rent


_query_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}


def query_for_date(ondate, cookiejar):
    u = make_search_url(ondate)
    resp = get(u, headers=_query_headers, cookies=cookiejar)
    resp.raise_for_status()
    return list(parse_results(resp.text))


def make_search_url(ondate):
    params = {
        'q': '"storrow drive" truck accident bridge OR overpass',
        'hl': 'en',
        'tbs': create_range_param(ondate),
        'google_abuse': get_google_abuse_token(),
    }
    query_part = urllib.parse.urlencode(params)
    return 'https://www.google.com/search?' + query_part


def create_range_param(ondate):
    # Restrict the search to a single day; strip leading zeros from the
    # month and day ("01/02/2021" -> "1/2/2021").
    datestr = ondate.strftime('%m/%d/%Y').lstrip('0').replace('/0', '/')
    return 'cdr:1,cd_min:%s,cd_max:%s' % (datestr, datestr)


def parse_results(raw_html):
    soup = BeautifulSoup(raw_html, 'html.parser')
    result_block = soup.find_all('div', attrs={'class': 'g'})
    for result in result_block:
        link = result.find('a', href=True)
        title = result.find('h3')
        if link and title:
            yield (link['href'], title.text)


def prep_nltk():
    nltk.download('punkt')
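

# Example driver: a minimal sketch of how the pieces fit together, assuming
# sbenv provides working load_cookiejar() and get_google_abuse_token()
# helpers.  The start date and day count are placeholders; prep_nltk() only
# needs to run once so newspaper's nlp() step has the punkt tokenizer.
if __name__ == '__main__':
    prep_nltk()
    start = datetime.date(2022, 1, 1)  # placeholder start date
    raw = query_range(start, 7)        # one query per day for a week
    triaged = process_results(raw)
    for day, buckets in triaged.items():
        print(day, 'pass:', len(buckets['pass']), 'maybe:', len(buckets['maybe']))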