
searchlib.py

import os
import datetime
import urllib.parse
import re
import http.cookiejar as cookielib

import newspaper
import nltk
from bs4 import BeautifulSoup
from requests import get

from sbenv import *

EXCL_DOMAINS = [
    r'(www\.)?researchgate\.net',
    r'(www\.)?businessyab\.com',
    r'.*twitter\.com',
    r'.*quora.*',
    r'.*\.ru',
]

# silly google
REWRITE_DOMAINS = [
    (r'.*\.facebook\.com', 'www.facebook.com')
]

# silly google
EXCL_PATH_EXTS = [
    '.pdf',
    '.docx',
]

# If a page has any of these keywords then we drop it.
EXCL_KWS = [
    # for some reason it shows personal injury lawyers
    'injury',
    'attorney',
    'court',
    'claim',
    # repair shops
    'repair',
]

# If it has any of these then it's a hard confirm.
PASS_KWS = [
    'storrow',
    'overpass',
]


def process_results(rtbl):
    """Triages a whole table of {query_date: raw result list} into pass/maybe buckets."""
    rw_domains = [(re.compile(reg), rw) for reg, rw in REWRITE_DOMAINS]
    excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS]
    results = {}
    for qdate, rents in rtbl.items():
        results[qdate] = _process_day_results(rw_domains, excl_domains, qdate, rents)
    return results


def process_day_results(query_date, rents):
    """Triages the raw results for a single query date."""
    rw_domains = [(re.compile(reg), rw) for reg, rw in REWRITE_DOMAINS]
    excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS]
    return _process_day_results(rw_domains, excl_domains, query_date, rents)


def _process_day_results(rw_domains, excl_domains, qdate, rents):
    articles = {
        'pass': [],
        'maybe': [],
    }
    for rent in rents:
        uscheme, unl, upath, upar, uq, ufrag = urllib.parse.urlparse(rent['u'])

        # Check skip because of domain exclusion.
        skip_cuz_url = False
        for reg in excl_domains:
            if reg.fullmatch(unl):
                skip_cuz_url = True
                break
        if skip_cuz_url:
            continue

        # Rewrite the domain, if applicable.
        for reg, rw in rw_domains:
            if reg.fullmatch(unl):
                unl = rw
                break

        # Check skip because of URL path file extension.
        skip_cuz_pathext = False
        for ext in EXCL_PATH_EXTS:
            if upath.endswith(ext):
                skip_cuz_pathext = True
                break
        if skip_cuz_pathext:
            continue

        has_kws = 'nkw' in rent

        # Check skip because of hard exclusion keywords.
        skip_cuz_kws = False
        if has_kws:
            for kw in EXCL_KWS:
                for akw in rent['nkw']:
                    if kw in akw:
                        skip_cuz_kws = True
                if skip_cuz_kws:
                    break
        if skip_cuz_kws:
            continue

        # Now characterize what kind of entry it is.
        has_date = 'nd' in rent
        has_title = 'nt' in rent
        has_pass_kw = False
        if has_kws:
            for kw in PASS_KWS:
                for akw in rent['nkw']:
                    if kw in akw:
                        has_pass_kw = True
                if has_pass_kw:
                    break

        # Try to assemble a record to store the thing.
        eff_date = rent['nd'] if has_date else qdate
        eff_url = urllib.parse.urlunparse([uscheme, unl, upath, upar, uq, ''])
        item = {
            'slug': gen_slug(rent['t'], eff_date),
            'url': eff_url,
            'gtitle': rent['t'],
            'title': rent['nt'] if has_title else None,
            'date': eff_date,
            'kws': rent['nkw'] if has_kws else None,
        }
        if has_pass_kw:
            articles['pass'].append(item)
        else:
            articles['maybe'].append(item)
    return articles


SLUG_CHUNKS = re.compile('[abcdefghijklmnopqrstuvwxyz1234567890]+')


def gen_slug(title, date):
    norm_title = title.lower()
    title_part = '-'.join(m.group(0) for m in SLUG_CHUNKS.finditer(norm_title))
    return '%s-%s' % (date, title_part)
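
# Illustrative example (not in the original source): gen_slug lowercases the
# title, keeps only the alphanumeric runs, and prefixes the date, so
#   gen_slug('Truck hits Storrow Drive overpass!', '2021-09-01')
# would produce '2021-09-01-truck-hits-storrow-drive-overpass'.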


def query_range(startdate, numdays, preloadurls=None):
    """Queries one day at a time starting at startdate and scrapes each result with newspaper.

    Returns {date string: list of result dicts}, suitable for process_results().
    """
    cookiejar = load_cookiejar()  # presumably provided by sbenv (the star import above)
    oneday = datetime.timedelta(days=1)
    seenurls = set() if preloadurls is None else set(preloadurls)
    dateurls = {}
    for i in range(numdays):
        d = startdate + (oneday * i)
        print(d)
        qresults = query_for_date(d, cookiejar)
        dayresults = []
        for rurl, rtitle in qresults:
            if rurl in seenurls:
                continue
            seenurls.add(rurl)
            rent = {
                'u': rurl,
                't': rtitle,
            }
            try:
                u = urllib.parse.urlparse(rurl)
                if (u.path == '/' or u.path == '') and u.params == '':
                    print('url is for website main page and has no params, probably not a news article:', rurl)
                    continue
                print('processing', rurl)
                a = newspaper.Article(rurl)
                a.download()
                a.parse()
                a.nlp()
                rent['nt'] = a.title
                try:
                    rent['nd'] = a.publish_date.strftime('%Y-%m-%d')
                except Exception:
                    pass
                rent['nkw'] = a.keywords
            except Exception as e:
                print(str(e))
            dayresults.append(rent)
        dateurls[d.strftime('%Y-%m-%d')] = dayresults
    return dateurls


_query_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}


def query_for_date(ondate, cookiejar):
    u = make_search_url(ondate)
    resp = get(u, headers=_query_headers, cookies=cookiejar)
    resp.raise_for_status()
    return list(parse_results(resp.text))


def make_search_url(ondate):
    params = {
        'q': '"storrow drive" truck accident bridge OR overpass',
        'hl': 'en',
        'tbs': create_range_param(ondate),
        'google_abuse': get_google_abuse_token(),  # presumably provided by sbenv
    }
    query_part = urllib.parse.urlencode(params)
    return 'https://www.google.com/search?' + query_part


def create_range_param(ondate):
    # Builds the custom-date-range value for the 'tbs' parameter;
    # for 2021-09-01 this yields 'cdr:1,cd_min:9/01/2021,cd_max:9/01/2021'.
    datestr = ondate.strftime('%m/%d/%Y').lstrip('0').replace(' 0', ' ')
    return 'cdr:1,cd_min:%s,cd_max:%s' % (datestr, datestr)


def parse_results(raw_html):
    # Yields (url, title) pairs pulled from the search result blocks.
    soup = BeautifulSoup(raw_html, 'html.parser')
    result_block = soup.find_all('div', attrs={'class': 'g'})
    for result in result_block:
        link = result.find('a', href=True)
        title = result.find('h3')
        if link and title:
            yield (link['href'], title.text)


def prep_nltk():
    # newspaper's .nlp() relies on NLTK's 'punkt' tokenizer data.
    nltk.download('punkt')
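

# Example usage (a sketch, not part of the original file; it assumes sbenv
# supplies load_cookiejar() and get_google_abuse_token(), as the star import
# at the top suggests, and that this module is importable as 'searchlib'):
#
#   import datetime
#   import searchlib
#
#   searchlib.prep_nltk()  # one-time download of NLTK tokenizer data
#   raw = searchlib.query_range(datetime.date(2021, 9, 1), 7)
#   triaged = searchlib.process_results(raw)
#   for day, buckets in triaged.items():
#       print(day, len(buckets['pass']), 'pass,', len(buckets['maybe']), 'maybe')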