
searchlib.py

import os
import datetime
import urllib.parse
import re
import http.cookiejar as cookielib

import newspaper
import nltk
from bs4 import BeautifulSoup
from requests import get

from sbenv import *

EXCL_DOMAINS = [
    r'(www\.)?researchgate\.net',
    r'(www\.)?businessyab\.com',
    r'.*twitter\.com',
    r'.*quora.*',
    r'.*\.ru',
]

# silly google
REWRITE_DOMAINS = [
    (r'.*\.facebook\.com', 'www.facebook.com')
]

# silly google
EXCL_PATH_EXTS = [
    '.pdf',
    '.docx'
]

# If a page has any of these keywords then we drop it.
EXCL_KWS = [
    # for some reason it shows personal injury lawyers
    'injury',
    'attorney',
    'court',
    'claim',
    # repair shops
    'repair',
]

# If it has any of these then it's a hard confirm.
PASS_KWS = [
    'storrow',
    'overpass',
]


def process_results(rtbl):
    rw_domains = [(re.compile(reg), rw) for reg, rw in REWRITE_DOMAINS]
    excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS]
    results = {}
    for qdate, rents in rtbl.items():
        for rent in rents:
            uscheme, unl, upath, upar, uq, ufrag = urllib.parse.urlparse(rent['u'])

            # Check skip because of domain exclusion
            skip_cuz_url = False
            for reg in excl_domains:
                if reg.fullmatch(unl):
                    skip_cuz_url = True
                    break
            if skip_cuz_url:
                continue

            # Rewrite the domain, if applicable
            for reg, rw in rw_domains:
                if reg.fullmatch(unl):
                    unl = rw
                    break

            # Check skip because of URL path file extension
            skip_cuz_pathext = False
            for ext in EXCL_PATH_EXTS:
                if upath.endswith(ext):
                    skip_cuz_pathext = True
                    break
            if skip_cuz_pathext:
                continue

            has_kws = 'nkw' in rent

            # Check skip because of hard exclusion keywords
            skip_cuz_kws = False
            if has_kws:
                for kw in EXCL_KWS:
                    for akw in rent['nkw']:
                        if kw in akw:
                            skip_cuz_kws = True
                    if skip_cuz_kws:
                        break
            if skip_cuz_kws:
                continue

            # Now characterize what kind of entry it is.
            has_date = 'nd' in rent
            has_title = 'nt' in rent
            has_pass_kw = False
            if has_kws:
                for kw in PASS_KWS:
                    for akw in rent['nkw']:
                        if kw in akw:
                            has_pass_kw = True
                    if has_pass_kw:
                        break

            # Try to assemble a record to store the thing.
            eff_date = rent['nd'] if has_date else qdate
            eff_url = urllib.parse.urlunparse([uscheme, unl, upath, upar, uq, ''])
            item = {
                'slug': gen_slug(rent['t'], eff_date),
                'url': eff_url,
                'gtitle': rent['t'],
                'title': rent['nt'] if has_title else None,
                'date': eff_date,
                'kws': rent['nkw'] if has_kws else None
            }
            if eff_date not in results:
                results[eff_date] = {
                    'pass': [],
                    'maybe': []
                }
            if has_pass_kw:
                results[eff_date]['pass'].append(item)
            else:
                results[eff_date]['maybe'].append(item)
    return results
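
# For reference, process_results() returns a structure shaped roughly like:
#
#   {
#       '2021-09-01': {
#           'pass':  [{'slug': ..., 'url': ..., 'gtitle': ..., 'title': ...,
#                      'date': ..., 'kws': ...}, ...],
#           'maybe': [...],
#       },
#       ...
#   }
#
# where 'pass' holds entries that matched a PASS_KWS keyword and 'maybe'
# holds everything else that survived the exclusion filters.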

SLUG_CHUNKS = re.compile('[abcdefghijklmnopqrstuvwxyz1234567890]+')


def gen_slug(title, date):
    norm_title = title.lower()
    title_part = '-'.join(m.group(0) for m in SLUG_CHUNKS.finditer(norm_title))
    return '%s-%s' % (date, title_part)


def query_range(startdate, numdays, preloadurls=None):
    cookiejar = load_cookiejar()
    oneday = datetime.timedelta(days=1)
    seenurls = set() if preloadurls is None else set(preloadurls)
    dateurls = {}
    for i in range(numdays):
        d = startdate + (oneday * i)
        print(d)
        qresults = query_for_date(d, cookiejar)
        dayresults = []
        for rurl, rtitle in qresults:
            if rurl in seenurls:
                continue
            seenurls.add(rurl)
            rent = {
                'u': rurl,
                't': rtitle,
            }
            try:
                u = urllib.parse.urlparse(rurl)
                if (u.path == '/' or u.path == '') and u.params == '':
                    print('url is for website main page and has no params, probably not a news article:', rurl)
                    continue
                print('processing', rurl)
                a = newspaper.Article(rurl)
                a.download()
                a.parse()
                a.nlp()
                rent['nt'] = a.title
                if a.publish_date is not None:
                    rent['nd'] = a.publish_date.strftime('%Y-%m-%d')
                rent['nkw'] = a.keywords
            except Exception as e:
                print(str(e))
            dayresults.append(rent)
        dateurls[d.strftime('%Y-%m-%d')] = dayresults
    return dateurls
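
# For reference, query_range() returns something shaped roughly like:
#
#   {
#       '2021-09-01': [
#           {'u': <result url>, 't': <result title>,
#            'nt': <newspaper title>, 'nd': 'YYYY-MM-DD', 'nkw': [...]},
#           ...
#       ],
#       ...
#   }
#
# The 'nt'/'nd'/'nkw' keys are only present when newspaper managed to
# download and parse the article.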

_query_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}


def query_for_date(ondate, cookiejar):
    u = make_search_url(ondate)
    resp = get(u, headers=_query_headers, cookies=cookiejar)
    resp.raise_for_status()
    return list(parse_results(resp.text))


def make_search_url(ondate):
    params = {
        'q': '"storrow drive" truck accident bridge OR overpass',
        'hl': 'en',
        'tbs': create_range_param(ondate),
        'google_abuse': get_google_abuse_token()
    }
    query_part = urllib.parse.urlencode(params)
    return 'https://www.google.com/search?' + query_part


def create_range_param(ondate):
    # Strip the leading zero from the month for Google's cdr (custom date
    # range) parameter.
    datestr = ondate.strftime('%m/%d/%Y').lstrip('0')
    return 'cdr:1,cd_min:%s,cd_max:%s' % (datestr, datestr)


def parse_results(raw_html):
    soup = BeautifulSoup(raw_html, 'html.parser')
    result_block = soup.find_all('div', attrs={'class': 'g'})
    for result in result_block:
        link = result.find('a', href=True)
        title = result.find('h3')
        if link and title:
            yield (link['href'], title.text)


def prep_nltk():
    # newspaper's .nlp() step needs NLTK's punkt tokenizer data.
    nltk.download('punkt')
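
A minimal driver sketch showing how the pieces above are meant to fit together: query_range() scrapes and enriches the raw search results, and process_results() filters and buckets them. This assumes sbenv (not shown here) supplies load_cookiejar() and get_google_abuse_token(), and that prep_nltk() has been run once so newspaper's .nlp() has its tokenizer data; the dates are just an example.

import datetime
import json

import searchlib

searchlib.prep_nltk()  # one-time: fetch the tokenizer data newspaper needs

# Scrape a week of results starting 2021-09-01.
raw = searchlib.query_range(datetime.date(2021, 9, 1), 7)
buckets = searchlib.process_results(raw)

# Print the hard-confirmed hits and stash everything for later inspection.
for day, groups in sorted(buckets.items()):
    for item in groups['pass']:
        print(day, item['slug'], item['url'])

with open('results.json', 'w') as f:
    json.dump(buckets, f, indent=2)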