
searchlib.py 7.1KB

"""Search-scraping helpers: query Google for "storrow drive" truck accident
stories day by day, fetch the linked pages with newspaper, and sort the
results into 'pass'/'maybe' buckets based on their keywords."""

import os
import datetime
import urllib.parse
import re
import http.cookiejar as cookielib

import newspaper
import nltk
from bs4 import BeautifulSoup
from requests import get

from sbenv import *

# Domains whose results we never want.
EXCL_DOMAINS = [
    r'(www\.)?researchgate\.net',
    r'(www\.)?businessyab\.com',
    r'.*twitter\.com',
    r'.*quora.*',
    r'.*\.ru',
    r'.*\.jp',
    r'.*xn--.*',
]

# silly google
REWRITE_DOMAINS = [
    (r'.*\.facebook\.com', 'www.facebook.com'),
]

# silly google
EXCL_PATH_EXTS = [
    '.pdf',
    '.docx',
]

# If a page has any of these keywords then we drop it.
EXCL_KWS = [
    # for some reason it shows personal injury lawyers
    'injury',
    'attorney',
    'court',
    'claim',
    # repair shops
    'repair',
]

# If it has any of these then it's a hard confirm.
PASS_KWS = [
    'storrow',
    'overpass',
]


def process_results(rtbl):
    """Filter and bucket a {query date: [result entries]} table, one day at a time."""
    rw_domains = [(re.compile(reg), rw) for reg, rw in REWRITE_DOMAINS]
    excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS]
    results = {}
    for qdate, rents in rtbl.items():
        results[qdate] = _process_day_results(rw_domains, excl_domains, qdate, rents)
    return results


def process_day_results(query_date, rents):
    """Filter and bucket a single day's result entries."""
    rw_domains = [(re.compile(reg), rw) for reg, rw in REWRITE_DOMAINS]
    excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS]
    return _process_day_results(rw_domains, excl_domains, query_date, rents)


def _process_day_results(rw_domains, excl_domains, qdate, rents):
    articles = {
        'pass': [],
        'maybe': [],
    }
    for rent in rents:
        uscheme, unl, upath, upar, uq, ufrag = urllib.parse.urlparse(rent['u'])

        # Check skip because of domain exclusion
        skip_cuz_url = False
        for reg in excl_domains:
            if reg.fullmatch(unl):
                skip_cuz_url = True
                break
        if skip_cuz_url:
            continue

        # Rewrite the domain, if applicable
        for reg, rw in rw_domains:
            if reg.fullmatch(unl):
                unl = rw
                break

        # Check skip because of URL path file extension
        skip_cuz_pathext = False
        for ext in EXCL_PATH_EXTS:
            # Also look at the full url anyways.
            if upath.endswith(ext) or rent['u'].endswith(ext):
                skip_cuz_pathext = True
                break
        if skip_cuz_pathext:
            continue

        has_kws = 'nkw' in rent

        # Check skip because of hard exclusion keywords
        skip_cuz_kws = False
        if has_kws:
            for kw in EXCL_KWS:
                for akw in rent['nkw']:
                    if kw in akw:
                        skip_cuz_kws = True
                if skip_cuz_kws:
                    break
        if skip_cuz_kws:
            continue

        # Now characterize what kind of entry it is.
        has_date = 'nd' in rent
        has_title = 'nt' in rent
        has_pass_kw = False
        if has_kws:
            for kw in PASS_KWS:
                for akw in rent['nkw']:
                    if kw in akw:
                        has_pass_kw = True
                if has_pass_kw:
                    break

        # Try to assemble a record to store the thing.
        eff_date = rent['nd'] if has_date else qdate
        eff_url = urllib.parse.urlunparse([uscheme, unl, upath, upar, uq, ''])
        item = {
            'slug': gen_slug(rent['t'], eff_date),
            'url': eff_url,
            'gtitle': rent['t'],
            'title': rent['nt'] if has_title else None,
            'date': eff_date,
            'kws': rent['nkw'] if has_kws else None,
        }
        if has_pass_kw:
            articles['pass'].append(item)
        else:
            articles['maybe'].append(item)
    return articles


SLUG_CHUNKS = re.compile('[abcdefghijklmnopqrstuvwxyz1234567890]+')


def gen_slug(title, date):
    """Build a slug of the form '<date>-<dashed-lowercase-title-words>'."""
    norm_title = title.lower()
    title_part = '-'.join(m.group(0) for m in SLUG_CHUNKS.finditer(norm_title))
    return '%s-%s' % (date, title_part)
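
# Illustrative example (not from the original source):
#   gen_slug('Truck hits Storrow Drive overpass', '2021-09-01')
#   -> '2021-09-01-truck-hits-storrow-drive-overpass'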


def query_range(startdate, numdays, preloadurls=None):
    """Query each day from startdate for numdays days, fetching any new article URLs."""
    cookiejar = load_cookiejar()
    oneday = datetime.timedelta(days=1)
    seenurls = set() if preloadurls is None else set(preloadurls)
    dateurls = {}
    for i in range(numdays):
        d = startdate + (oneday * i)
        print(d)
        dres = _query_day_and_fetch(d, cookiejar, seenurls)
        dateurls[d.strftime('%Y-%m-%d')] = dres
    return dateurls


def query_day(date, preloadurls=None):
    """Query a single day, fetching any new article URLs."""
    cookiejar = load_cookiejar()
    seenurls = set() if preloadurls is None else set(preloadurls)
    res = _query_day_and_fetch(date, cookiejar, seenurls)
    return res


def _query_day_and_fetch(date, cookiejar, seenurls_mut):
    qresults = query_for_date(date, cookiejar)
    dayresults = []
    for rurl, rtitle in qresults:
        if rurl in seenurls_mut:
            continue
        seenurls_mut.add(rurl)
        rent = {
            'u': rurl,
            't': rtitle,
        }
        fa = _fetch_article(rurl, cookiejar)
        if fa is not None:
            rent.update(fa)
        dayresults.append(rent)
    return dayresults


def fetch_article(url):
    cj = load_cookiejar()
    fa = _fetch_article(url, cj)
    if fa is None:
        return None
    fa['u'] = url
    if 'nt' in fa:
        fa['t'] = fa['nt']
    return fa


def _fetch_article(rurl, cookiejar):
    rent = {}
    try:
        u = urllib.parse.urlparse(rurl)
        if (u.path == '/' or u.path == '') and u.params == '' and u.query == '':
            print('url is for website main page and has no params, probably not a news article:', rurl)
            return None
        print('processing', rurl)
        a = newspaper.Article(rurl)
        a.download()
        a.parse()
        a.nlp()
        rent['nt'] = a.title
        # publish_date may be None if newspaper couldn't find one.
        if a.publish_date is not None:
            rent['nd'] = a.publish_date.strftime('%Y-%m-%d')
        rent['nkw'] = a.keywords
    except Exception as e:
        print(str(e))
    return rent


_query_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}


def query_for_date(ondate, cookiejar):
    u = make_search_url(ondate)
    resp = get(u, headers=_query_headers, cookies=cookiejar)
    resp.raise_for_status()
    return list(parse_results(resp.text))


def make_search_url(ondate):
    params = {
        'q': '"storrow drive" truck accident bridge OR overpass',
        'hl': 'en',
        'tbs': create_range_param(ondate),
        'google_abuse': get_google_abuse_token(),
    }
    query_part = urllib.parse.urlencode(params)
    return 'https://www.google.com/search?' + query_part


def create_range_param(ondate):
    # Custom date range (cdr) limited to a single day: min and max are the same date.
    datestr = ondate.strftime('%m/%d/%Y').lstrip('0').replace(' 0', ' ')
    return 'cdr:1,cd_min:%s,cd_max:%s' % (datestr, datestr)


def parse_results(raw_html):
    soup = BeautifulSoup(raw_html, 'html.parser')
    result_block = soup.find_all('div', attrs={'class': 'g'})
    for result in result_block:
        link = result.find('a', href=True)
        title = result.find('h3')
        if link and title:
            yield (link['href'], title.text)


def prep_nltk():
    # newspaper's .nlp() relies on NLTK's punkt tokenizer data.
    nltk.download('punkt')
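

# Illustrative usage sketch (an assumption, not part of the original module):
# query a few days, then bucket the results. Assumes sbenv supplies working
# cookies and an abuse token, and that Google doesn't block the requests.
if __name__ == '__main__':
    prep_nltk()
    raw = query_range(datetime.date(2021, 9, 1), 3)
    processed = process_results(raw)
    for day, buckets in processed.items():
        print(day, '-', len(buckets['pass']), 'confirmed,', len(buckets['maybe']), 'to review')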