import os
import datetime
import urllib.parse

import http.cookiejar as cookielib
import newspaper
import nltk
from bs4 import BeautifulSoup
from requests import get

# Local helper module; expected to provide load_cookiejar() and
# get_google_abuse_token(), both used below.
from sbenv import *

# Result domains to exclude outright.
EXCL_DOMAINS = [
    r'(www\.)?researchgate\.net',
    r'(www\.)?businessyab\.com',
]

# silly google
REWRITE_DOMAINS = [
    (r'.*\.facebook\.com', 'www.facebook.com'),
]

# silly google
EXCL_PATH_EXTS = [
    '.pdf',
    '.docx',
]

# If a page has any of these keywords then we drop it.
EXCL_KWS = [
    # for some reason it shows personal injury lawyers
    'injury',
    'attorney',
    'court',
    'claim',

    # repair shops
    'repair',
]

# If a page has any of these keywords then it's a hard confirm.
PASS_KW = [
    'storrow',
    'storrowed',
    'overpass',
    'bridge',
]
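
# A minimal sketch (an illustration, not code from the pipeline above) of how
# the two keyword lists might be combined when filtering a page's keywords:
# per the comments above, a PASS_KW hit is a hard confirm and so should win
# over EXCL_KWS. The function name is hypothetical.
def keyword_verdict(keywords):
    if any(kw in keywords for kw in PASS_KW):
        return True   # hard confirm
    if any(kw in keywords for kw in EXCL_KWS):
        return False  # drop it
    return None       # undecided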

def query_range(startdate, numdays):
    """Run one Google query per day for numdays days starting at startdate.

    Returns a dict mapping 'YYYY-MM-DD' date strings to lists of result
    entries; URLs already seen on an earlier day are skipped.
    """
    cookiejar = load_cookiejar()

    oneday = datetime.timedelta(days=1)
    seenurls = set()

    dateurls = {}

    for i in range(numdays):
        d = startdate + (oneday * i)
        print(d)
        qresults = query_for_date(d, cookiejar)

        dayresults = []
        for rurl, rtitle in qresults:
            if rurl in seenurls:
                continue
            seenurls.add(rurl)

            # 'u'/'t' are the URL and title as scraped from the results page.
            rent = {
                'u': rurl,
                't': rtitle,
            }

            try:
                # Have newspaper fetch and analyze the article itself, and
                # store its title, publish date, and keywords alongside the
                # search result's own.
                print('processing', rurl)
                a = newspaper.Article(rurl)
                a.download()
                a.parse()
                a.nlp()

                rent['nt'] = a.title
                # publish_date may be None, in which case strftime raises.
                try:
                    rent['nd'] = a.publish_date.strftime('%Y-%m-%d')
                except AttributeError:
                    pass
                rent['nkw'] = a.keywords

            except Exception as e:
                print(str(e))

            dayresults.append(rent)

        dateurls[d.strftime('%Y-%m-%d')] = dayresults

    return dateurls

_query_headers = {
    # Present a desktop Chrome User-Agent; Google is more likely to serve
    # the full results page to a recognizable browser.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}

def query_for_date(ondate, cookiejar):
    u = make_search_url(ondate)
    resp = get(u, headers=_query_headers, cookies=cookiejar)
    resp.raise_for_status()
    return list(parse_results(resp.text))

def make_search_url(ondate):
    params = {
        'q': '"storrow drive" truck accident bridge OR overpass',
        'hl': 'en',
        'tbs': create_range_param(ondate),
        # Anti-bot token pulled from the saved session (see sbenv).
        'google_abuse': get_google_abuse_token(),
    }

    query_part = urllib.parse.urlencode(params)
    return 'https://www.google.com/search?' + query_part

def create_range_param(ondate):
    # Google's cdr (custom date range) parameter takes m/d/yyyy dates without
    # zero padding; strftime has no portable unpadded format code, so build
    # the string by hand.
    datestr = '%d/%d/%d' % (ondate.month, ondate.day, ondate.year)
    return 'cdr:1,cd_min:%s,cd_max:%s' % (datestr, datestr)
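
# For example, create_range_param(datetime.date(2021, 6, 1)) yields
# 'cdr:1,cd_min:6/1/2021,cd_max:6/1/2021'.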

def parse_results(raw_html):
    # Each organic result sits in a div with class 'g'; pull out its link and
    # title. (Google's markup shifts over time, so this selector is fragile.)
    soup = BeautifulSoup(raw_html, 'html.parser')
    result_block = soup.find_all('div', attrs={'class': 'g'})
    for result in result_block:
        link = result.find('a', href=True)
        title = result.find('h3')
        if link and title:
            yield (link['href'], title.text)

def prep_nltk():
    # newspaper's .nlp() needs NLTK's punkt tokenizer models.
    nltk.download('punkt')
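
# A minimal usage sketch (an illustration, not part of the original script):
# scan a week of results and dump them as JSON. The start date and span here
# are arbitrary.
if __name__ == '__main__':
    import json

    prep_nltk()
    results = query_range(datetime.date(2021, 6, 1), 7)
    print(json.dumps(results, indent=2))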