
searchlib.py

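"""Search helpers for finding news coverage of trucks striking Storrow Drive
overpasses: query Google one day at a time, run each hit through newspaper
for a title, date, and keywords, then filter and bucket the results by date.
"""
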
import os
import datetime
import urllib.parse
import re
import http.cookiejar as cookielib

import newspaper
import nltk
from bs4 import BeautifulSoup
from requests import get

from sbenv import *

EXCL_DOMAINS = [
    r'(www\.)?researchgate\.net',
    r'(www\.)?businessyab\.com',
    r'.*twitter\.com',
    r'.*quora.*',
]

# silly google
REWRITE_DOMAINS = [
    (r'.*\.facebook\.com', 'www.facebook.com'),
]

# silly google
EXCL_PATH_EXTS = [
    '.pdf',
    '.docx',
]

# If a page has any of these keywords then we drop it.
EXCL_KWS = [
    # for some reason it shows personal injury lawyers
    'injury',
    'attorney',
    'court',
    'claim',
    # repair shops
    'repair',
]

# If it has any of these then it's a hard confirm.
PASS_KWS = [
    'storrow',
    'overpass',
]

def process_results(rtbl):
    rw_domains = [(re.compile(reg), rw) for reg, rw in REWRITE_DOMAINS]
    excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS]
    results = {}
    for qdate, rents in rtbl.items():
        for rent in rents:
            uscheme, unl, upath, uq, ufrag = urllib.parse.urlsplit(rent['u'])

            # Check skip because of domain exclusion
            skip_cuz_url = False
            for reg in excl_domains:
                if reg.fullmatch(unl):
                    skip_cuz_url = True
                    break
            if skip_cuz_url:
                continue

            # Rewrite the domain, if applicable
            for reg, rw in rw_domains:
                if reg.fullmatch(unl):
                    unl = rw
                    break

            # Check skip because of URL path file extension
            skip_cuz_pathext = False
            for ext in EXCL_PATH_EXTS:
                if upath.endswith(ext):
                    skip_cuz_pathext = True
                    break
            if skip_cuz_pathext:
                continue

            has_kws = 'nkw' in rent

            # Check skip because of hard exclusion keywords
            skip_cuz_kws = False
            if has_kws:
                for kw in EXCL_KWS:
                    for akw in rent['nkw']:
                        if kw in akw:
                            skip_cuz_kws = True
                    if skip_cuz_kws:
                        break
            if skip_cuz_kws:
                continue

            # Now characterize what kind of entry it is.
            has_date = 'nd' in rent
            has_title = 'nt' in rent
            has_pass_kw = False
            if has_kws:
                for kw in PASS_KWS:
                    for akw in rent['nkw']:
                        if kw in akw:
                            has_pass_kw = True
                    if has_pass_kw:
                        break

            # Try to assemble a record to store the thing.
            eff_date = rent['nd'] if has_date else qdate
            eff_url = urllib.parse.urlunsplit([uscheme, unl, upath, uq, ufrag])
            item = {
                'slug': gen_slug(rent['t'], eff_date),
                'url': eff_url,
                'gtitle': rent['t'],
                'title': rent['nt'] if has_title else None,
                'date': eff_date,
                'kws': rent['nkw'] if has_kws else None,
            }
            if eff_date not in results:
                results[eff_date] = {
                    'pass': [],
                    'maybe': [],
                }
            if has_pass_kw:
                results[eff_date]['pass'].append(item)
            else:
                results[eff_date]['maybe'].append(item)
    return results

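# Shape of the mapping returned by process_results above, sketched with
# placeholder values (the keys mirror the item dict built in the loop):
#
#   {
#       '2021-07-04': {
#           'pass':  [{'slug': ..., 'url': ..., 'gtitle': ..., 'title': ...,
#                      'date': ..., 'kws': ...}],   # matched a PASS_KWS keyword
#           'maybe': [...],                         # survived the filters, unconfirmed
#       },
#   }
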
SLUG_CHUNKS = re.compile('[a-z0-9]+')


def gen_slug(title, date):
    norm_title = title.lower()
    title_part = '-'.join(m.group(0) for m in SLUG_CHUNKS.finditer(norm_title))
    return '%s-%s' % (date, title_part)

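# For example (illustrative title), the slug is the date plus the lowercased
# alphanumeric chunks of the title joined by dashes:
#   gen_slug('Truck hits Storrow Drive overpass', '2021-07-04')
#   -> '2021-07-04-truck-hits-storrow-drive-overpass'
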
def query_range(startdate, numdays, preloadurls=None):
    cookiejar = load_cookiejar()
    oneday = datetime.timedelta(days=1)
    seenurls = set() if preloadurls is None else set(preloadurls)
    dateurls = {}
    for i in range(numdays):
        d = startdate + (oneday * i)
        print(d)
        qresults = query_for_date(d, cookiejar)
        dayresults = []
        for rurl, rtitle in qresults:
            if rurl in seenurls:
                continue
            seenurls.add(rurl)
            rent = {
                'u': rurl,
                't': rtitle,
            }
            try:
                print('processing', rurl)
                a = newspaper.Article(rurl)
                a.download()
                a.parse()
                a.nlp()
                rent['nt'] = a.title
                try:
                    rent['nd'] = a.publish_date.strftime('%Y-%m-%d')
                except Exception:
                    # publish_date may be missing or unusable; fall back to the query date.
                    pass
                rent['nkw'] = a.keywords
            except Exception as e:
                print(str(e))
            dayresults.append(rent)
        dateurls[d.strftime('%Y-%m-%d')] = dayresults
    return dateurls

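# query_range returns the raw per-day records that process_results consumes,
# keyed by query date, e.g. (placeholder values):
#
#   {
#       '2021-07-04': [
#           {'u': <result URL>, 't': <Google result title>,
#            'nt': <newspaper title>, 'nd': <publish date>, 'nkw': <keywords>},
#           # 'nt'/'nd'/'nkw' are present only if article extraction succeeded
#       ],
#   }
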
_query_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
}

def query_for_date(ondate, cookiejar):
    u = make_search_url(ondate)
    resp = get(u, headers=_query_headers, cookies=cookiejar)
    resp.raise_for_status()
    return list(parse_results(resp.text))

def make_search_url(ondate):
    params = {
        'q': '"storrow drive" truck accident bridge OR overpass',
        'hl': 'en',
        'tbs': create_range_param(ondate),
        'google_abuse': get_google_abuse_token(),
    }
    query_part = urllib.parse.urlencode(params)
    return 'https://www.google.com/search?' + query_part

def create_range_param(ondate):
    # Google's cdr tbs parameter expects M/D/YYYY dates without leading zeros.
    datestr = '%d/%d/%d' % (ondate.month, ondate.day, ondate.year)
    return 'cdr:1,cd_min:%s,cd_max:%s' % (datestr, datestr)

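# For example, with the zero-stripped M/D/YYYY format above:
#   create_range_param(datetime.date(2021, 7, 4))
#   -> 'cdr:1,cd_min:7/4/2021,cd_max:7/4/2021'
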
def parse_results(raw_html):
    soup = BeautifulSoup(raw_html, 'html.parser')
    result_block = soup.find_all('div', attrs={'class': 'g'})
    for result in result_block:
        link = result.find('a', href=True)
        title = result.find('h3')
        if link and title:
            yield (link['href'], title.text)

def prep_nltk():
    nltk.download('punkt')
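

# Minimal usage sketch (commented out; assumes sbenv supplies load_cookiejar()
# and get_google_abuse_token() as used above, and the date and filename here
# are only illustrative):
#
#   import datetime
#   import json
#
#   prep_nltk()
#   raw = query_range(datetime.date(2021, 7, 1), 7)
#   curated = process_results(raw)
#   with open('results.json', 'w') as f:
#       json.dump(curated, f, indent=2)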