
searchlib.py 3.0KB

import os
import datetime
import urllib.parse
import http.cookiejar as cookielib

import newspaper
import nltk
from bs4 import BeautifulSoup
from requests import get

# load_cookiejar() and get_google_abuse_token() used below presumably come
# from this project-local star import.
from sbenv import *

EXCL_DOMAINS = [
    r'(www\.)?researchgate\.net',
    r'(www\.)?businessyab\.com',
]

# silly google
REWRITE_DOMAINS = [
    (r'.*\.facebook\.com', 'www.facebook.com'),
]

# silly google
EXCL_PATH_EXTS = [
    '.pdf',
    '.docx',
]

# If a page has any of these keywords then we drop it.
EXCL_KWS = [
    # for some reason it shows personal injury lawyers
    'injury',
    'attorney',
    'court',
    'claim',
    # repair shops
    'repair',
]

# If it has any of these then it's a hard confirm.
PASS_KW = [
    'storrow',
    'storrowed',
    'overpass',
    'bridge',
]
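
# The code that applies the lists above isn't in this file, so here is a
# minimal hypothetical sketch of how a scraped result might be triaged with
# them: hard-accept on any PASS_KW hit, otherwise drop on any EXCL_KWS hit.
# The name classify_result and the 'keep'/'drop'/'unsure' labels are
# illustrative assumptions, not part of the project.
def classify_result(title, keywords):
    text = ' '.join([title] + list(keywords)).lower()
    if any(kw in text for kw in PASS_KW):
        return 'keep'    # hard confirm (e.g. "storrowed")
    if any(kw in text for kw in EXCL_KWS):
        return 'drop'    # personal-injury lawyers, repair shops, ...
    return 'unsure'      # leave for manual review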


def query_range(startdate, numdays):
    """Run one search per day for numdays days starting at startdate.

    Returns a dict mapping 'YYYY-MM-DD' date strings to lists of result
    entries ('u' = URL, 't' = result title, plus newspaper-derived fields).
    """
    cookiejar = load_cookiejar()
    oneday = datetime.timedelta(days=1)
    seenurls = set()
    dateurls = {}
    for i in range(numdays):
        d = startdate + (oneday * i)
        print(d)
        qresults = query_for_date(d, cookiejar)
        dayresults = []
        for rurl, rtitle in qresults:
            # Skip URLs already seen on an earlier day.
            if rurl in seenurls:
                continue
            seenurls.add(rurl)
            rent = {
                'u': rurl,
                't': rtitle,
            }
            try:
                print('processing', rurl)
                a = newspaper.Article(rurl)
                a.download()
                a.parse()
                a.nlp()
                rent['nt'] = a.title
                try:
                    rent['nd'] = a.publish_date.strftime('%Y-%m-%d')
                except AttributeError:
                    # publish_date can be None when newspaper can't find one
                    pass
                rent['nkw'] = a.keywords
            except Exception as e:
                # Keep the bare search result even if article processing fails.
                print(str(e))
            dayresults.append(rent)
        dateurls[d.strftime('%Y-%m-%d')] = dayresults
    return dateurls


_query_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}


def query_for_date(ondate, cookiejar):
    u = make_search_url(ondate)
    resp = get(u, headers=_query_headers, cookies=cookiejar)
    resp.raise_for_status()
    return list(parse_results(resp.text))


def make_search_url(ondate):
    params = {
        'q': '"storrow drive" truck accident bridge OR overpass',
        'hl': 'en',
        'tbs': create_range_param(ondate),
        'google_abuse': get_google_abuse_token()
    }
    query_part = urllib.parse.urlencode(params)
    return 'https://www.google.com/search?' + query_part


def create_range_param(ondate):
    # Google's custom date range (cdr) parameter uses un-padded dates, so
    # strip the leading zero from the month and from the day.
    datestr = ondate.strftime('%m/%d/%Y').lstrip('0').replace('/0', '/')
    return 'cdr:1,cd_min:%s,cd_max:%s' % (datestr, datestr)
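
# Worked example (assuming a datetime.date input):
#   create_range_param(datetime.date(2021, 3, 5))
#   '03/05/2021' -> '3/5/2021' -> 'cdr:1,cd_min:3/5/2021,cd_max:3/5/2021'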


def parse_results(raw_html):
    soup = BeautifulSoup(raw_html, 'html.parser')
    result_block = soup.find_all('div', attrs={'class': 'g'})
    for result in result_block:
        link = result.find('a', href=True)
        title = result.find('h3')
        if link and title:
            yield (link['href'], title.text)


def prep_nltk():
    # newspaper's Article.nlp() relies on NLTK's punkt tokenizer.
    nltk.download('punkt')
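

# Hypothetical usage sketch (not in the original file): scrape a week of
# results and dump them to JSON. Assumes sbenv supplies working cookies and
# an abuse token; the start date and output filename are made up for
# illustration.
if __name__ == '__main__':
    import json
    prep_nltk()
    results = query_range(datetime.date(2021, 3, 1), 7)
    with open('storrow-results.json', 'w') as f:
        json.dump(results, f, indent=2)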