
searchlib.py

import os
import datetime
import urllib.parse
import re
import http.cookiejar as cookielib

import newspaper
import nltk
from bs4 import BeautifulSoup
from requests import get

from sbenv import *
EXCL_DOMAINS = [
    r'(www\.)?researchgate\.net',
    r'(www\.)?businessyab.com',
    r'.*twitter\.com',
    r'.*quora.*',
    r'.*\.ru',
]

# silly google
REWRITE_DOMAINS = [
    (r'.*\.facebook\.com', 'www.facebook.com')
]

# silly google
EXCL_PATH_EXTS = [
    '.pdf',
    '.docx'
]

# If a page has any of these keywords then we drop it.
EXCL_KWS = [
    # for some reason it shows personal injury lawyers
    'injury',
    'attorney',
    'court',
    'claim',
    # repair shops
    'repair',
]

# If it has any of these then it's a hard confirm.
PASS_KWS = [
    'storrow',
    'overpass',
]

def process_results(rtbl):
    rw_domains = [(re.compile(reg), rw) for reg, rw in REWRITE_DOMAINS]
    excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS]
    results = {}
    for qdate, rents in rtbl.items():
        results[qdate] = _process_day_results(rw_domains, excl_domains, qdate, rents)
    return results


def process_day_results(query_date, rents):
    rw_domains = [(re.compile(reg), rw) for reg, rw in REWRITE_DOMAINS]
    excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS]
    return _process_day_results(rw_domains, excl_domains, query_date, rents)

def _process_day_results(rw_domains, excl_domains, qdate, rents):
    articles = {
        'pass': [],
        'maybe': [],
    }
    for rent in rents:
        uscheme, unl, upath, upar, uq, ufrag = urllib.parse.urlparse(rent['u'])

        # Check skip because of domain exclusion
        skip_cuz_url = False
        for reg in excl_domains:
            if reg.fullmatch(unl):
                skip_cuz_url = True
                break
        if skip_cuz_url:
            continue

        # Rewrite the domain, if applicable
        for reg, rw in rw_domains:
            if reg.fullmatch(unl):
                unl = rw
                break

        # Check skip because of URL path file extension
        skip_cuz_pathext = False
        for ext in EXCL_PATH_EXTS:
            if upath.endswith(ext):
                skip_cuz_pathext = True
                break
        if skip_cuz_pathext:
            continue

        has_kws = 'nkw' in rent

        # Check skip because of hard exclusion keywords
        skip_cuz_kws = False
        if has_kws:
            for kw in EXCL_KWS:
                for akw in rent['nkw']:
                    if kw in akw:
                        skip_cuz_kws = True
                if skip_cuz_kws:
                    break
        if skip_cuz_kws:
            continue

        # Now characterize what kind of entry it is.
        has_date = 'nd' in rent
        has_title = 'nt' in rent
        has_pass_kw = False
        if has_kws:
            for kw in PASS_KWS:
                for akw in rent['nkw']:
                    if kw in akw:
                        has_pass_kw = True
                if has_pass_kw:
                    break

        # Try to assemble a record to store the thing.
        eff_date = rent['nd'] if has_date else qdate
        eff_url = urllib.parse.urlunparse([uscheme, unl, upath, upar, uq, ''])
        item = {
            'slug': gen_slug(rent['t'], eff_date),
            'url': eff_url,
            'gtitle': rent['t'],
            'title': rent['nt'] if has_title else None,
            'date': eff_date,
            'kws': rent['nkw'] if has_kws else None
        }
        if has_pass_kw:
            articles['pass'].append(item)
        else:
            articles['maybe'].append(item)
    return articles
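
# Sketch of the shape _process_day_results() returns (values illustrative only):
#   {
#     'pass':  [{'slug': ..., 'url': ..., 'gtitle': ..., 'title': ..., 'date': ..., 'kws': [...]}, ...],
#     'maybe': [...same item shape...],
#   }
# 'pass' items matched a PASS_KWS keyword; everything else that survived the
# exclusion filters lands in 'maybe'.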

SLUG_CHUNKS = re.compile('[abcdefghijklmnopqrstuvwxyz1234567890]+')


def gen_slug(title, date):
    norm_title = title.lower()
    title_part = '-'.join(m.group(0) for m in SLUG_CHUNKS.finditer(norm_title))
    return '%s-%s' % (date, title_part)
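
# For example, gen_slug('Truck hits Storrow Drive overpass', '2021-09-01')
# should yield '2021-09-01-truck-hits-storrow-drive-overpass': the title is
# lowercased, split on anything that is not a letter or digit, and joined
# with dashes after the date.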

def query_range(startdate, numdays, preloadurls=None):
    cookiejar = load_cookiejar()
    oneday = datetime.timedelta(days=1)
    seenurls = set() if preloadurls is None else set(preloadurls)
    dateurls = {}
    for i in range(numdays):
        d = startdate + (oneday * i)
        print(d)
        dres = _query_day_and_fetch(d, cookiejar, seenurls)
        dateurls[d.strftime('%Y-%m-%d')] = dres
    return dateurls


def query_day(date, preloadurls=None):
    cookiejar = load_cookiejar()
    seenurls = set() if preloadurls is None else set(preloadurls)
    res = _query_day_and_fetch(date, cookiejar, seenurls)
    return res

def _query_day_and_fetch(date, cookiejar, seenurls_mut):
    qresults = query_for_date(date, cookiejar)
    dayresults = []
    for rurl, rtitle in qresults:
        if rurl in seenurls_mut:
            continue
        seenurls_mut.add(rurl)
        rent = {
            'u': rurl,
            't': rtitle,
        }
        fa = _fetch_article(rurl, cookiejar)
        if fa is not None:
            rent.update(fa)
        dayresults.append(rent)
    return dayresults


def fetch_article(url):
    cj = load_cookiejar()
    fa = _fetch_article(url, cj)
    if fa is None:
        return None
    fa['u'] = url
    if 'nt' in fa:
        fa['t'] = fa['nt']
    return fa

def _fetch_article(rurl, cookiejar):
    rent = {}
    try:
        u = urllib.parse.urlparse(rurl)
        if (u.path == '/' or u.path == '') and u.params == '':
            print('url is for website main page and has no params, probably not a news article:', rurl)
            return None
        print('processing', rurl)
        a = newspaper.Article(rurl)
        a.download()
        a.parse()
        a.nlp()
        rent['nt'] = a.title
        try:
            # publish_date can be None, in which case strftime fails and we just skip the date.
            rent['nd'] = a.publish_date.strftime('%Y-%m-%d')
        except Exception:
            pass
        rent['nkw'] = a.keywords
    except Exception as e:
        print(str(e))
    return rent

_query_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}


def query_for_date(ondate, cookiejar):
    u = make_search_url(ondate)
    resp = get(u, headers=_query_headers, cookies=cookiejar)
    resp.raise_for_status()
    return list(parse_results(resp.text))

def make_search_url(ondate):
    params = {
        'q': '"storrow drive" truck accident bridge OR overpass',
        'hl': 'en',
        'tbs': create_range_param(ondate),
        'google_abuse': get_google_abuse_token()
    }
    query_part = urllib.parse.urlencode(params)
    return 'https://www.google.com/search?' + query_part


def create_range_param(ondate):
    # Strip the leading zero from the month and day so the date matches what
    # Google's custom date range ("cdr") picker produces.
    datestr = ondate.strftime('%m/%d/%Y').lstrip('0').replace('/0', '/')
    return 'cdr:1,cd_min:%s,cd_max:%s' % (datestr, datestr)
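
# For example, create_range_param(datetime.date(2021, 9, 1)) should produce
# 'cdr:1,cd_min:9/1/2021,cd_max:9/1/2021', which make_search_url() URL-encodes
# into the 'tbs' query parameter to restrict results to that single day.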

def parse_results(raw_html):
    soup = BeautifulSoup(raw_html, 'html.parser')
    result_block = soup.find_all('div', attrs={'class': 'g'})
    for result in result_block:
        link = result.find('a', href=True)
        title = result.find('h3')
        if link and title:
            yield (link['href'], title.text)


def prep_nltk():
    nltk.download('punkt')
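
# A minimal usage sketch, assuming sbenv provides working load_cookiejar() and
# get_google_abuse_token() helpers and that the nltk 'punkt' data is available.
# Names below ('start', 'raw', 'processed') are illustrative only.
#
#   import datetime
#   import searchlib
#
#   searchlib.prep_nltk()                       # one-time nltk 'punkt' download
#   start = datetime.date(2021, 9, 1)
#   raw = searchlib.query_range(start, 7)       # {'YYYY-MM-DD': [rent, ...], ...}
#   processed = searchlib.process_results(raw)  # {'YYYY-MM-DD': {'pass': [...], 'maybe': [...]}}
#   for day, buckets in processed.items():
#       print(day, len(buckets['pass']), 'confirmed,', len(buckets['maybe']), 'maybes')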