
searchlib.py 7.1KB

"""Search-scraping helpers: query Google for "storrow drive" truck accident
stories day by day, fetch the linked pages with newspaper, and sort the
results into 'pass'/'maybe' buckets based on their keywords."""

import os
import datetime
import urllib.parse
import re
import http.cookiejar as cookielib

import newspaper
import nltk
from bs4 import BeautifulSoup
from requests import get

from sbenv import *

# Domains whose results we never want.
EXCL_DOMAINS = [
    r'(www\.)?researchgate\.net',
    r'(www\.)?businessyab\.com',
    r'.*twitter\.com',
    r'.*quora.*',
    r'.*\.ru',
    r'.*\.jp',
    r'.*xn--.*',
]

# silly google
REWRITE_DOMAINS = [
    (r'.*\.facebook\.com', 'www.facebook.com'),
]

# silly google
EXCL_PATH_EXTS = [
    '.pdf',
    '.docx',
]

# If a page has any of these keywords then we drop it.
EXCL_KWS = [
    # for some reason it shows personal injury lawyers
    'injury',
    'attorney',
    'court',
    'claim',
    # repair shops
    'repair',
]

# If it has any of these then it's a hard confirm.
PASS_KWS = [
    'storrow',
    'overpass',
]


def process_results(rtbl):
    """Filter and bucket a {query date: [result entries]} table, one day at a time."""
    rw_domains = [(re.compile(reg), rw) for reg, rw in REWRITE_DOMAINS]
    excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS]
    results = {}
    for qdate, rents in rtbl.items():
        results[qdate] = _process_day_results(rw_domains, excl_domains, qdate, rents)
    return results


def process_day_results(query_date, rents):
    """Filter and bucket a single day's result entries."""
    rw_domains = [(re.compile(reg), rw) for reg, rw in REWRITE_DOMAINS]
    excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS]
    return _process_day_results(rw_domains, excl_domains, query_date, rents)


def _process_day_results(rw_domains, excl_domains, qdate, rents):
    articles = {
        'pass': [],
        'maybe': [],
    }
    for rent in rents:
        uscheme, unl, upath, upar, uq, ufrag = urllib.parse.urlparse(rent['u'])

        # Check skip because of domain exclusion
        skip_cuz_url = False
        for reg in excl_domains:
            if reg.fullmatch(unl):
                skip_cuz_url = True
                break
        if skip_cuz_url:
            continue

        # Rewrite the domain, if applicable
        for reg, rw in rw_domains:
            if reg.fullmatch(unl):
                unl = rw
                break

        # Check skip because of URL path file extension
        skip_cuz_pathext = False
        for ext in EXCL_PATH_EXTS:
            # Also look at the full url anyways.
            if upath.endswith(ext) or rent['u'].endswith(ext):
                skip_cuz_pathext = True
                break
        if skip_cuz_pathext:
            continue

        has_kws = 'nkw' in rent

        # Check skip because of hard exclusion keywords
        skip_cuz_kws = False
        if has_kws:
            for kw in EXCL_KWS:
                for akw in rent['nkw']:
                    if kw in akw:
                        skip_cuz_kws = True
                if skip_cuz_kws:
                    break
        if skip_cuz_kws:
            continue

        # Now characterize what kind of entry it is.
        has_date = 'nd' in rent
        has_title = 'nt' in rent
        has_pass_kw = False
        if has_kws:
            for kw in PASS_KWS:
                for akw in rent['nkw']:
                    if kw in akw:
                        has_pass_kw = True
                if has_pass_kw:
                    break

        # Try to assemble a record to store the thing.
        eff_date = rent['nd'] if has_date else qdate
        eff_url = urllib.parse.urlunparse([uscheme, unl, upath, upar, uq, ''])
        item = {
            'slug': gen_slug(rent['t'], eff_date),
            'url': eff_url,
            'gtitle': rent['t'],
            'title': rent['nt'] if has_title else None,
            'date': eff_date,
            'kws': rent['nkw'] if has_kws else None,
        }
        if has_pass_kw:
            articles['pass'].append(item)
        else:
            articles['maybe'].append(item)
    return articles


SLUG_CHUNKS = re.compile('[abcdefghijklmnopqrstuvwxyz1234567890]+')


def gen_slug(title, date):
    """Build a slug of the form '<date>-<dashed-lowercase-title-words>'."""
    norm_title = title.lower()
    title_part = '-'.join(m.group(0) for m in SLUG_CHUNKS.finditer(norm_title))
    return '%s-%s' % (date, title_part)
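
# Illustrative example (not from the original source):
#   gen_slug('Truck hits Storrow Drive overpass', '2021-09-01')
#   -> '2021-09-01-truck-hits-storrow-drive-overpass'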


def query_range(startdate, numdays, preloadurls=None):
    """Query each day from startdate for numdays days, fetching any new article URLs."""
    cookiejar = load_cookiejar()
    oneday = datetime.timedelta(days=1)
    seenurls = set() if preloadurls is None else set(preloadurls)
    dateurls = {}
    for i in range(numdays):
        d = startdate + (oneday * i)
        print(d)
        dres = _query_day_and_fetch(d, cookiejar, seenurls)
        dateurls[d.strftime('%Y-%m-%d')] = dres
    return dateurls


def query_day(date, preloadurls=None):
    """Query a single day, fetching any new article URLs."""
    cookiejar = load_cookiejar()
    seenurls = set() if preloadurls is None else set(preloadurls)
    res = _query_day_and_fetch(date, cookiejar, seenurls)
    return res


def _query_day_and_fetch(date, cookiejar, seenurls_mut):
    qresults = query_for_date(date, cookiejar)
    dayresults = []
    for rurl, rtitle in qresults:
        if rurl in seenurls_mut:
            continue
        seenurls_mut.add(rurl)
        rent = {
            'u': rurl,
            't': rtitle,
        }
        fa = _fetch_article(rurl, cookiejar)
        if fa is not None:
            rent.update(fa)
        dayresults.append(rent)
    return dayresults


def fetch_article(url):
    cj = load_cookiejar()
    fa = _fetch_article(url, cj)
    if fa is None:
        return None
    fa['u'] = url
    if 'nt' in fa:
        fa['t'] = fa['nt']
    return fa


def _fetch_article(rurl, cookiejar):
    rent = {}
    try:
        u = urllib.parse.urlparse(rurl)
        if (u.path == '/' or u.path == '') and u.params == '' and u.query == '':
            print('url is for website main page and has no params, probably not a news article:', rurl)
            return None
        print('processing', rurl)
        a = newspaper.Article(rurl)
        a.download()
        a.parse()
        a.nlp()
        rent['nt'] = a.title
        # publish_date may be None if newspaper couldn't find one.
        if a.publish_date is not None:
            rent['nd'] = a.publish_date.strftime('%Y-%m-%d')
        rent['nkw'] = a.keywords
    except Exception as e:
        print(str(e))
    return rent


_query_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}


def query_for_date(ondate, cookiejar):
    u = make_search_url(ondate)
    resp = get(u, headers=_query_headers, cookies=cookiejar)
    resp.raise_for_status()
    return list(parse_results(resp.text))


def make_search_url(ondate):
    params = {
        'q': '"storrow drive" truck accident bridge OR overpass',
        'hl': 'en',
        'tbs': create_range_param(ondate),
        'google_abuse': get_google_abuse_token(),
    }
    query_part = urllib.parse.urlencode(params)
    return 'https://www.google.com/search?' + query_part


def create_range_param(ondate):
    # Custom date range (cdr) limited to a single day: min and max are the same date.
    datestr = ondate.strftime('%m/%d/%Y').lstrip('0').replace(' 0', ' ')
    return 'cdr:1,cd_min:%s,cd_max:%s' % (datestr, datestr)


def parse_results(raw_html):
    soup = BeautifulSoup(raw_html, 'html.parser')
    result_block = soup.find_all('div', attrs={'class': 'g'})
    for result in result_block:
        link = result.find('a', href=True)
        title = result.find('h3')
        if link and title:
            yield (link['href'], title.text)


def prep_nltk():
    # newspaper's .nlp() relies on NLTK's punkt tokenizer data.
    nltk.download('punkt')
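

# Illustrative usage sketch (an assumption, not part of the original module):
# query a few days, then bucket the results. Assumes sbenv supplies working
# cookies and an abuse token, and that Google doesn't block the requests.
if __name__ == '__main__':
    prep_nltk()
    raw = query_range(datetime.date(2021, 9, 1), 3)
    processed = process_results(raw)
    for day, buckets in processed.items():
        print(day, '-', len(buckets['pass']), 'confirmed,', len(buckets['maybe']), 'to review')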