
searchlib.py

import os
import datetime
import urllib.parse
import re
import http.cookiejar as cookielib

import newspaper
import nltk
from bs4 import BeautifulSoup
from requests import get

from sbenv import *

EXCL_DOMAINS = [
    r'(www\.)?researchgate\.net',
    r'(www\.)?businessyab\.com',
    r'.*twitter\.com',
    r'.*quora.*',
    r'.*\.ru',
]

# silly google
REWRITE_DOMAINS = [
    (r'.*\.facebook\.com', 'www.facebook.com')
]

# silly google
EXCL_PATH_EXTS = [
    '.pdf',
    '.docx'
]

# If a page has any of these keywords then we drop it.
EXCL_KWS = [
    # for some reason it shows personal injury lawyers
    'injury',
    'attorney',
    'court',
    'claim',
    # repair shops
    'repair',
]

# If it has any of these then it's a hard confirm.
PASS_KWS = [
    'storrow',
    'overpass',
]


def process_results(rtbl):
    rw_domains = [(re.compile(reg), rw) for reg, rw in REWRITE_DOMAINS]
    excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS]
    results = {}
    for qdate, rents in rtbl.items():
        for rent in rents:
            uscheme, unl, upath, upar, uq, ufrag = urllib.parse.urlparse(rent['u'])

            # Check skip because of domain exclusion
            skip_cuz_url = False
            for reg in excl_domains:
                if reg.fullmatch(unl):
                    skip_cuz_url = True
                    break
            if skip_cuz_url:
                continue

            # Rewrite the domain, if applicable
            for reg, rw in rw_domains:
                if reg.fullmatch(unl):
                    unl = rw
                    break

            # Check skip because of URL path file extension
            skip_cuz_pathext = False
            for ext in EXCL_PATH_EXTS:
                if upath.endswith(ext):
                    skip_cuz_pathext = True
                    break
            if skip_cuz_pathext:
                continue

            has_kws = 'nkw' in rent

            # Check skip because of hard exclusion keywords
            skip_cuz_kws = False
            if has_kws:
                for kw in EXCL_KWS:
                    for akw in rent['nkw']:
                        if kw in akw:
                            skip_cuz_kws = True
                    if skip_cuz_kws:
                        break
            if skip_cuz_kws:
                continue

            # Now characterize what kind of entry it is.
            has_date = 'nd' in rent
            has_title = 'nt' in rent
            has_pass_kw = False
            if has_kws:
                for kw in PASS_KWS:
                    for akw in rent['nkw']:
                        if kw in akw:
                            has_pass_kw = True
                    if has_pass_kw:
                        break

            # Try to assemble a record to store the thing.
            eff_date = rent['nd'] if has_date else qdate
            eff_url = urllib.parse.urlunparse([uscheme, unl, upath, upar, uq, ''])
            item = {
                'slug': gen_slug(rent['t'], eff_date),
                'url': eff_url,
                'gtitle': rent['t'],
                'title': rent['nt'] if has_title else None,
                'date': eff_date,
                'kws': rent['nkw'] if has_kws else None
            }
            if eff_date not in results:
                results[eff_date] = {
                    'pass': [],
                    'maybe': []
                }
            if has_pass_kw:
                results[eff_date]['pass'].append(item)
            else:
                results[eff_date]['maybe'].append(item)
    return results
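
# For reference, process_results() returns a structure shaped roughly like:
#
#   {
#       '2021-09-01': {
#           'pass':  [{'slug': ..., 'url': ..., 'gtitle': ..., 'title': ...,
#                      'date': ..., 'kws': ...}, ...],
#           'maybe': [...],
#       },
#       ...
#   }
#
# where 'pass' holds entries that matched a PASS_KWS keyword and 'maybe'
# holds everything else that survived the exclusion filters.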

SLUG_CHUNKS = re.compile('[abcdefghijklmnopqrstuvwxyz1234567890]+')


def gen_slug(title, date):
    norm_title = title.lower()
    title_part = '-'.join(m.group(0) for m in SLUG_CHUNKS.finditer(norm_title))
    return '%s-%s' % (date, title_part)


def query_range(startdate, numdays, preloadurls=None):
    cookiejar = load_cookiejar()
    oneday = datetime.timedelta(days=1)
    seenurls = set() if preloadurls is None else set(preloadurls)
    dateurls = {}
    for i in range(numdays):
        d = startdate + (oneday * i)
        print(d)
        qresults = query_for_date(d, cookiejar)
        dayresults = []
        for rurl, rtitle in qresults:
            if rurl in seenurls:
                continue
            seenurls.add(rurl)
            rent = {
                'u': rurl,
                't': rtitle,
            }
            try:
                u = urllib.parse.urlparse(rurl)
                if (u.path == '/' or u.path == '') and u.params == '':
                    print('url is for website main page and has no params, probably not a news article:', rurl)
                    continue
                print('processing', rurl)
                a = newspaper.Article(rurl)
                a.download()
                a.parse()
                a.nlp()
                rent['nt'] = a.title
                if a.publish_date is not None:
                    rent['nd'] = a.publish_date.strftime('%Y-%m-%d')
                rent['nkw'] = a.keywords
            except Exception as e:
                print(str(e))
            dayresults.append(rent)
        dateurls[d.strftime('%Y-%m-%d')] = dayresults
    return dateurls
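
# For reference, query_range() returns something shaped roughly like:
#
#   {
#       '2021-09-01': [
#           {'u': <result url>, 't': <result title>,
#            'nt': <newspaper title>, 'nd': 'YYYY-MM-DD', 'nkw': [...]},
#           ...
#       ],
#       ...
#   }
#
# The 'nt'/'nd'/'nkw' keys are only present when newspaper managed to
# download and parse the article.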

_query_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}


def query_for_date(ondate, cookiejar):
    u = make_search_url(ondate)
    resp = get(u, headers=_query_headers, cookies=cookiejar)
    resp.raise_for_status()
    return list(parse_results(resp.text))


def make_search_url(ondate):
    params = {
        'q': '"storrow drive" truck accident bridge OR overpass',
        'hl': 'en',
        'tbs': create_range_param(ondate),
        'google_abuse': get_google_abuse_token()
    }
    query_part = urllib.parse.urlencode(params)
    return 'https://www.google.com/search?' + query_part


def create_range_param(ondate):
    # Strip the leading zero from the month for Google's cdr (custom date
    # range) parameter.
    datestr = ondate.strftime('%m/%d/%Y').lstrip('0')
    return 'cdr:1,cd_min:%s,cd_max:%s' % (datestr, datestr)


def parse_results(raw_html):
    soup = BeautifulSoup(raw_html, 'html.parser')
    result_block = soup.find_all('div', attrs={'class': 'g'})
    for result in result_block:
        link = result.find('a', href=True)
        title = result.find('h3')
        if link and title:
            yield (link['href'], title.text)


def prep_nltk():
    # newspaper's .nlp() step needs NLTK's punkt tokenizer data.
    nltk.download('punkt')
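
A minimal driver sketch showing how the pieces above are meant to fit together: query_range() scrapes and enriches the raw search results, and process_results() filters and buckets them. This assumes sbenv (not shown here) supplies load_cookiejar() and get_google_abuse_token(), and that prep_nltk() has been run once so newspaper's .nlp() has its tokenizer data; the dates are just an example.

import datetime
import json

import searchlib

searchlib.prep_nltk()  # one-time: fetch the tokenizer data newspaper needs

# Scrape a week of results starting 2021-09-01.
raw = searchlib.query_range(datetime.date(2021, 9, 1), 7)
buckets = searchlib.process_results(raw)

# Print the hard-confirmed hits and stash everything for later inspection.
for day, groups in sorted(buckets.items()):
    for item in groups['pass']:
        print(day, item['slug'], item['url'])

with open('results.json', 'w') as f:
    json.dump(buckets, f, indent=2)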