import datetime
import re
import urllib.parse

import newspaper
import nltk
from bs4 import BeautifulSoup
from requests import get

from sbenv import *

EXCL_DOMAINS = [
    r'(www\.)?researchgate\.net',
    r'(www\.)?businessyab\.com',
    r'.*twitter\.com',
    r'.*quora.*'
]

# silly google
REWRITE_DOMAINS = [
    (r'.*\.facebook\.com', 'www.facebook.com')
]

# silly google
EXCL_PATH_EXTS = [
    '.pdf',
    '.docx'
]

# If a page has any of these keywords then we drop it.
EXCL_KWS = [
    # for some reason it shows personal injury lawyers
    'injury',
    'attorney',
    'court',
    'claim',

    # repair shops
    'repair',
]

# If it has any of these then it's a hard confirm
PASS_KWS = [
    'storrow',
    'overpass',
]

def process_results(rtbl):
    rw_domains = [(re.compile(reg), rw) for reg, rw in REWRITE_DOMAINS]
    excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS]

    results = {}

    for qdate, rents in rtbl.items():
        for rent in rents:
            uscheme, unl, upath, uq, ufrag = urllib.parse.urlsplit(rent['u'])

            # Check skip because of domain exclusion
            skip_cuz_url = False
            for reg in excl_domains:
                if reg.fullmatch(unl):
                    skip_cuz_url = True
                    break
            if skip_cuz_url:
                continue

            # Rewrite the domain, if applicable
            for reg, rw in rw_domains:
                if reg.fullmatch(unl):
                    unl = rw
                    break

            # Check skip because of URL path file extension
            skip_cuz_pathext = False
            for ext in EXCL_PATH_EXTS:
                if upath.endswith(ext):
                    skip_cuz_pathext = True
                    break
            if skip_cuz_pathext:
                continue

            has_kws = 'nkw' in rent

            # Check skip because of hard exclusion keywords
            skip_cuz_kws = False
            if has_kws:
                for kw in EXCL_KWS:
                    for akw in rent['nkw']:
                        if kw in akw:
                            skip_cuz_kws = True
                    if skip_cuz_kws:
                        break
            if skip_cuz_kws:
                continue

            # Now characterize what kind of entry it is.
            has_date = 'nd' in rent
            has_title = 'nt' in rent
            has_pass_kw = False
            if has_kws:
                for kw in PASS_KWS:
                    for akw in rent['nkw']:
                        if kw in akw:
                            has_pass_kw = True
                    if has_pass_kw:
                        break

            # Try to assemble a record to store the thing.
            eff_date = rent['nd'] if has_date else qdate
            eff_url = urllib.parse.urlunsplit([uscheme, unl, upath, uq, ufrag])
            item = {
                'slug': gen_slug(rent['t'], eff_date),
                'url': eff_url,
                'gtitle': rent['t'],
                'title': rent['nt'] if has_title else None,
                'date': eff_date,
                'kws': rent['nkw'] if has_kws else None
            }

            if eff_date not in results:
                results[eff_date] = {
                    'pass': [],
                    'maybe': []
                }

            if has_pass_kw:
                results[eff_date]['pass'].append(item)
            else:
                results[eff_date]['maybe'].append(item)

    return results
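
# For reference, process_results() returns a dict keyed by 'YYYY-MM-DD' date
# strings, e.g.:
#
#   {
#       '2021-09-01': {
#           'pass':  [item, ...],   # items whose keywords hit PASS_KWS
#           'maybe': [item, ...],   # everything else that survived the filters
#       },
#   }
#
# where each item is the dict assembled above (slug/url/gtitle/title/date/kws).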

SLUG_CHUNKS = re.compile(r'[a-z0-9]+')

def gen_slug(title, date):
    norm_title = title.lower()
    title_part = '-'.join(m.group(0) for m in SLUG_CHUNKS.finditer(norm_title))
    return '%s-%s' % (date, title_part)
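
# Example (hypothetical title, just to show the slug shape):
#   gen_slug('Truck hits Storrow Drive overpass!', '2021-09-01')
#   -> '2021-09-01-truck-hits-storrow-drive-overpass'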

def query_range(startdate, numdays, preloadurls=None):
    cookiejar = load_cookiejar()

    oneday = datetime.timedelta(days=1)
    seenurls = set() if preloadurls is None else set(preloadurls)

    dateurls = {}

    for i in range(numdays):
        d = startdate + (oneday * i)
        print(d)
        qresults = query_for_date(d, cookiejar)

        dayresults = []
        for rurl, rtitle in qresults:
            if rurl in seenurls:
                continue
            seenurls.add(rurl)

            rent = {
                'u': rurl,
                't': rtitle,
            }

            try:
                print('processing', rurl)
                a = newspaper.Article(rurl)
                a.download()
                a.parse()
                a.nlp()

                rent['nt'] = a.title
                try:
                    rent['nd'] = a.publish_date.strftime('%Y-%m-%d')
                except AttributeError:
                    # publish_date is None when newspaper can't find a date
                    pass
                rent['nkw'] = a.keywords
            except Exception as e:
                print(str(e))

            dayresults.append(rent)

        dateurls[d.strftime('%Y-%m-%d')] = dayresults

    return dateurls
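
# Each per-day record carries the raw Google result plus whatever newspaper
# managed to extract:
#   'u'   - result URL (always present)
#   't'   - title from the results page (always present)
#   'nt'  - newspaper's extracted title (only if the fetch succeeded)
#   'nd'  - publish date as 'YYYY-MM-DD' (only if newspaper found one)
#   'nkw' - newspaper's keywords (only if the fetch succeeded)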

_query_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}

def query_for_date(ondate, cookiejar):
    u = make_search_url(ondate)
    resp = get(u, headers=_query_headers, cookies=cookiejar)
    resp.raise_for_status()
    return list(parse_results(resp.text))

def make_search_url(ondate):
    params = {
        'q': '"storrow drive" truck accident bridge OR overpass',
        'hl': 'en',
        'tbs': create_range_param(ondate),
        'google_abuse': get_google_abuse_token()
    }

    query_part = urllib.parse.urlencode(params)
    return 'https://www.google.com/search?' + query_part

def create_range_param(ondate):
    # Google's cdr param wants non-zero-padded m/d/Y dates.
    datestr = '%d/%d/%d' % (ondate.month, ondate.day, ondate.year)
    return 'cdr:1,cd_min:%s,cd_max:%s' % (datestr, datestr)
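
# Example: create_range_param(datetime.date(2021, 9, 1)) returns
#   'cdr:1,cd_min:9/1/2021,cd_max:9/1/2021'
# i.e. a custom date range pinned to a single day.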

def parse_results(raw_html):
    soup = BeautifulSoup(raw_html, 'html.parser')
    result_block = soup.find_all('div', attrs={'class': 'g'})
    for result in result_block:
        link = result.find('a', href=True)
        title = result.find('h3')
        if link and title:
            yield (link['href'], title.text)

def prep_nltk():
    # newspaper's .nlp() needs NLTK's punkt tokenizer
    nltk.download('punkt')
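
# A minimal sketch of how the pieces fit together, assuming sbenv supplies
# load_cookiejar() and get_google_abuse_token() (both used above but defined
# elsewhere). The start date and day count here are made up.
if __name__ == '__main__':
    prep_nltk()
    raw = query_range(datetime.date(2021, 9, 1), 7)
    curated = process_results(raw)
    for day, buckets in sorted(curated.items()):
        print(day, '-', len(buckets['pass']), 'pass /', len(buckets['maybe']), 'maybe')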
|