import os
import datetime
import urllib.parse

import http.cookiejar as cookielib
import newspaper
import nltk
from bs4 import BeautifulSoup
from requests import get

# Local helper module; expected to provide load_cookiejar() and
# get_google_abuse_token(), both used below.
from sbenv import *

# Result domains to exclude outright.
EXCL_DOMAINS = [
    r'(www\.)?researchgate\.net',
    r'(www\.)?businessyab\.com',
]

# silly google
REWRITE_DOMAINS = [
    (r'.*\.facebook\.com', 'www.facebook.com'),
]

# silly google
EXCL_PATH_EXTS = [
    '.pdf',
    '.docx',
]

# If a page has any of these keywords then we drop it.
EXCL_KWS = [
    # for some reason it shows personal injury lawyers
    'injury',
    'attorney',
    'court',
    'claim',

    # repair shops
    'repair',
]

# If a page has any of these keywords then it's a hard confirm.
PASS_KW = [
    'storrow',
    'storrowed',
    'overpass',
    'bridge',
]
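
# A minimal sketch (an illustration, not code from the pipeline above) of how
# the two keyword lists might be combined when filtering a page's keywords:
# per the comments above, a PASS_KW hit is a hard confirm and so should win
# over EXCL_KWS. The function name is hypothetical.
def keyword_verdict(keywords):
    if any(kw in keywords for kw in PASS_KW):
        return True   # hard confirm
    if any(kw in keywords for kw in EXCL_KWS):
        return False  # drop it
    return None       # undecided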

def query_range(startdate, numdays):
    """Run one Google query per day for numdays days starting at startdate.

    Returns a dict mapping 'YYYY-MM-DD' date strings to lists of result
    entries; URLs already seen on an earlier day are skipped.
    """
    cookiejar = load_cookiejar()

    oneday = datetime.timedelta(days=1)
    seenurls = set()

    dateurls = {}

    for i in range(numdays):
        d = startdate + (oneday * i)
        print(d)
        qresults = query_for_date(d, cookiejar)

        dayresults = []
        for rurl, rtitle in qresults:
            if rurl in seenurls:
                continue
            seenurls.add(rurl)

            # 'u'/'t' are the URL and title as scraped from the results page.
            rent = {
                'u': rurl,
                't': rtitle,
            }

            try:
                # Have newspaper fetch and analyze the article itself, and
                # store its title, publish date, and keywords alongside the
                # search result's own.
                print('processing', rurl)
                a = newspaper.Article(rurl)
                a.download()
                a.parse()
                a.nlp()

                rent['nt'] = a.title
                # publish_date may be None, in which case strftime raises.
                try:
                    rent['nd'] = a.publish_date.strftime('%Y-%m-%d')
                except AttributeError:
                    pass
                rent['nkw'] = a.keywords

            except Exception as e:
                print(str(e))

            dayresults.append(rent)

        dateurls[d.strftime('%Y-%m-%d')] = dayresults

    return dateurls

_query_headers = {
    # Present a desktop Chrome User-Agent; Google is more likely to serve
    # the full results page to a recognizable browser.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}

def query_for_date(ondate, cookiejar):
    u = make_search_url(ondate)
    resp = get(u, headers=_query_headers, cookies=cookiejar)
    resp.raise_for_status()
    return list(parse_results(resp.text))

def make_search_url(ondate):
    params = {
        'q': '"storrow drive" truck accident bridge OR overpass',
        'hl': 'en',
        'tbs': create_range_param(ondate),
        # Anti-bot token pulled from the saved session (see sbenv).
        'google_abuse': get_google_abuse_token(),
    }

    query_part = urllib.parse.urlencode(params)
    return 'https://www.google.com/search?' + query_part

def create_range_param(ondate):
    # Google's cdr (custom date range) parameter takes m/d/yyyy dates without
    # zero padding; strftime has no portable unpadded format code, so build
    # the string by hand.
    datestr = '%d/%d/%d' % (ondate.month, ondate.day, ondate.year)
    return 'cdr:1,cd_min:%s,cd_max:%s' % (datestr, datestr)
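
# For example, create_range_param(datetime.date(2021, 6, 1)) yields
# 'cdr:1,cd_min:6/1/2021,cd_max:6/1/2021'.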

def parse_results(raw_html):
    # Each organic result sits in a div with class 'g'; pull out its link and
    # title. (Google's markup shifts over time, so this selector is fragile.)
    soup = BeautifulSoup(raw_html, 'html.parser')
    result_block = soup.find_all('div', attrs={'class': 'g'})
    for result in result_block:
        link = result.find('a', href=True)
        title = result.find('h3')
        if link and title:
            yield (link['href'], title.text)

def prep_nltk():
    # newspaper's .nlp() needs NLTK's punkt tokenizer models.
    nltk.download('punkt')
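
# A minimal usage sketch (an illustration, not part of the original script):
# scan a week of results and dump them as JSON. The start date and span here
# are arbitrary.
if __name__ == '__main__':
    import json

    prep_nltk()
    results = query_range(datetime.date(2021, 6, 1), 7)
    print(json.dumps(results, indent=2))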