Browse Source

Added article processing and filtering.

master
Trey Del Bonis 2 years ago
parent
commit
e772957f1b
3 changed files with 918 additions and 6 deletions
  1. 99
    6
      searchlib.py
  2. 13
    0
      testprocess.py
  3. 806
    0
      testresults.json

+ 99
- 6
searchlib.py View File

@@ -1,6 +1,7 @@
import os
import datetime
import urllib
import re

import http.cookiejar as cookielib
import newspaper
@@ -12,7 +13,9 @@ from sbenv import *

# Host-name regexes for results that are never worth keeping.
# process_results() compiles each one and applies it with fullmatch() against
# the URL's network location, so a pattern must describe the entire host.
# Raw strings keep the regex escapes intact, and every entry needs its own
# trailing comma so adjacent literals are not silently concatenated.
EXCL_DOMAINS = [
    r'(www\.)?researchgate\.net',
    r'(www\.)?businessyab\.com',
    r'.*twitter\.com',
    r'.*quora.*',
]

# silly google
@@ -39,18 +42,108 @@ EXCL_KWS = [
]

# If it has any of these then it's a hard confirm
# process_results() tests each entry as a substring of every article keyword,
# so 'storrow' already covers variants such as 'storrowed' and 'storrowing'.
PASS_KWS = [
    'storrow',
    'storrowed',
    'overpass',
    'bridge',
]

def query_range(startdate, numdays):
def _has_any_keyword(needles, article_kws):
    """Return True if any needle is a substring of any article keyword."""
    return any(needle in akw for needle in needles for akw in article_kws)


def process_results(rtbl):
    """Filter raw search results and bucket the survivors by effective date.

    ``rtbl`` maps a query date to a list of result entries (dicts).  Every
    entry must carry ``'u'`` (the URL) and ``'t'`` (stored as ``gtitle`` and
    used for the slug).  Optional keys are ``'nt'`` (stored as ``title``),
    ``'nd'`` (overrides the query date) and ``'nkw'`` (list of keywords).

    An entry is dropped when:
      * its host fully matches a pattern in EXCL_DOMAINS,
      * its URL path ends with one of EXCL_PATH_EXTS, or
      * one of its keywords contains an EXCL_KWS term.

    A host that fully matches a REWRITE_DOMAINS pattern is replaced with
    that pattern's rewrite target (first match wins).

    Returns a dict keyed by effective date whose values have the shape
    ``{'pass': [...], 'maybe': [...]}``.  A record goes to ``'pass'`` when
    one of its keywords contains a PASS_KWS term, otherwise to ``'maybe'``.
    """
    rw_domains = [(re.compile(reg), rw) for reg, rw in REWRITE_DOMAINS]
    excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS]
    # str.endswith accepts a tuple, so one call covers every extension.
    excl_exts = tuple(EXCL_PATH_EXTS)

    results = {}

    for qdate, rents in rtbl.items():
        for rent in rents:
            uscheme, unl, upath, uq, ufrag = urllib.parse.urlsplit(rent['u'])

            # Skip because of domain exclusion.
            if any(reg.fullmatch(unl) for reg in excl_domains):
                continue

            # Rewrite the domain, if applicable.
            for reg, rw in rw_domains:
                if reg.fullmatch(unl):
                    unl = rw
                    break

            # Skip because of URL path file extension.
            if upath.endswith(excl_exts):
                continue

            # None when the entry has no keyword list at all.
            kws = rent.get('nkw')

            # Skip because of hard exclusion keywords.
            if kws is not None and _has_any_keyword(EXCL_KWS, kws):
                continue

            # Now characterize what kind of entry it is.
            has_pass_kw = kws is not None and _has_any_keyword(PASS_KWS, kws)

            # Assemble the record.  The URL was taken apart with urlsplit()
            # (five fields), so it must be rebuilt with urlunsplit();
            # urlunparse() expects six fields and would shift the query
            # string into the ';params' slot.
            eff_date = rent.get('nd', qdate)
            eff_url = urllib.parse.urlunsplit(
                (uscheme, unl, upath, uq, ufrag))
            item = {
                'slug': gen_slug(rent['t'], eff_date),
                'url': eff_url,
                'gtitle': rent['t'],
                'title': rent.get('nt'),
                'date': eff_date,
                'kws': kws,
            }

            bucket = results.setdefault(eff_date, {'pass': [], 'maybe': []})
            bucket['pass' if has_pass_kw else 'maybe'].append(item)

    return results

# Matches maximal runs of lowercase ASCII letters and digits; every other
# character acts as a separator between slug words.
SLUG_CHUNKS = re.compile('[abcdefghijklmnopqrstuvwxyz1234567890]+')


def gen_slug(title, date):
    """Build a slug of the form ``<date>-<word>-<word>-...`` from a title.

    The title is lowercased, each run of characters matched by SLUG_CHUNKS
    becomes one word, and the words are joined with hyphens after the date.
    A title with no matching characters yields ``<date>-``.
    """
    slug_words = SLUG_CHUNKS.findall(title.lower())
    return '%s-%s' % (date, '-'.join(slug_words))

def query_range(startdate, numdays, preloadurls=None):
cookiejar = load_cookiejar()

oneday = datetime.timedelta(days=1)
seenurls = set()
seenurls = set() if preloadurls is None else set(preloadurls)

dateurls = {}


+ 13
- 0
testprocess.py View File

@@ -0,0 +1,13 @@
#!/usr/bin/env python3

import sys
import json

import searchlib

if __name__ == '__main__':
    # Usage: testprocess.py <results.json>
    # Loads the JSON file named by the first argument, runs it through
    # searchlib.process_results(), and prints the result as indented JSON.
    results_path = sys.argv[1]
    with open(results_path, 'r') as results_file:
        raw_table = json.load(results_file)
    processed = searchlib.process_results(raw_table)
    print(json.dumps(processed, indent=' '))

+ 806
- 0
testresults.json View File

@@ -0,0 +1,806 @@
{
"2021-08-02": [
{
"u": "https://www.bostonglobe.com/metro/transportation/",
"t": "Transportation - The Boston Globe",
"nt": "The Boston Globe",
"nkw": [
"spreads",
"rush",
"mbta",
"globe",
"ride",
"safe",
"variant",
"transit",
"workers",
"boston",
"week",
"vaccinated"
]
}
],
"2021-08-03": [
{
"u": "https://www.quora.com/What-do-truck-drivers-do-if-they-encounter-a-bridge-they-cant-fit-under",
"t": "What do truck drivers do if they encounter a bridge they can't fit ...",
"nt": "What do truck drivers do if they encounter a bridge they can't fit under?",
"nkw": [
"encounter",
"truck",
"moment",
"cant",
"againtry",
"bridge",
"drivers",
"try",
"went",
"wait",
"fit",
"wrong"
]
},
{
"u": "https://www.facebook.com/MethuenPolice/posts",
"t": "Methuen Police Department - Posts | Facebook",
"nt": "Methuen Police Department",
"nkw": [
"leave",
"help",
"increase",
"valuables",
"vehicle",
"viewhelp",
"methuen",
"utilize",
"weeks",
"vehicles",
"department",
"target"
]
},
{
"u": "https://m.facebook.com/ReverePoliceDept/",
"t": "Revere Police - Home | Facebook",
"nt": "Revere Police",
"nkw": [
"safetythank",
"booksnational",
"community",
"partners",
"bridge",
"2021",
"way",
"revere",
"public",
"night",
"national"
]
},
{
"u": "https://www.columbus.gov/workarea/downloadasset.aspx?id=68368",
"t": "ACTIVE LICENSED CONTRACTORS BY BUSINESS NAME",
"nt": "",
"nkw": []
},
{
"u": "https://www.newburyportnews.com/news/local_news/newbury-conservation-commission-to-review-pearson-drive-project/article_317642ee-f4d1-11eb-b12d-4769112d26c2.html",
"t": "Newbury Conservation Commission to review Pearson Drive ...",
"nt": "Newbury Conservation Commission to review Pearson Drive project",
"nkw": [
"rita",
"worked",
"western",
"resident",
"review",
"shipyard",
"wwii",
"passed",
"peacefully",
"project",
"newbury",
"conservation",
"seabrook",
"commission",
"portsmouth",
"pearson",
"drive"
]
},
{
"u": "https://www.infineon.com/",
"t": "Infineon Technologies: Semiconductor & System Solutions",
"nt": "Semiconductor & System Solutions",
"nkw": [
"experts",
"infineons",
"semiconductor",
"application",
"system",
"trends",
"shaping",
"learn",
"global",
"tech",
"solutions"
]
},
{
"u": "https://www.walmart.com/sitemap_ip_pub904_entertainment.xml.gz",
"t": "https://www.walmart.com/ip/Regulacion-de-La-Sintesis-de ...",
"nt": "",
"nkw": []
},
{
"u": "https://www.mechanicadvisor.com/ga/marietta/ford-repair-shops",
"t": "10 Best Ford Repair Shops Marietta, GA - Mechanic Advisor",
"nt": "10 Best Ford Repair Shops Marietta, GA",
"nkw": [
"car",
"fuse",
"read",
"needed",
"shops",
"told",
"going",
"best",
"service",
"center",
"repair",
"marietta",
"box",
"ordered",
"ga",
"ford"
]
},
{
"u": "https://www.mechanicadvisor.com/ga/atlanta/nissan-repair-shops",
"t": "10 Best Nissan Repair Shops Atlanta, GA - Mechanic Advisor",
"nt": "10 Best Nissan Repair Shops Atlanta, GA",
"nkw": [
"atlanta",
"car",
"fuse",
"nissan",
"read",
"needed",
"shops",
"told",
"going",
"best",
"service",
"center",
"ordered",
"box",
"ga",
"repair"
]
},
{
"u": "https://library.georgiancollege.ca/c.php?g=720607&p=5151225",
"t": "AI Trends - Artificial Intelligence - Georgian College Library"
}
],
"2021-08-04": [
{
"u": "https://boston.cbslocal.com/2021/08/04/storrow-drive-over-height-truck/",
"t": "Moving Truck 'Storrowed' Near Mass General, Causing Traffic ...",
"nt": "Moving Truck \u2018Storrowed\u2019 Near Mass General, Causing Traffic Delays",
"nd": "2021-08-04",
"nkw": [
"pick",
"truck",
"storrowed",
"traffic",
"juhann",
"celtics",
"near",
"revs",
"kevin",
"mass",
"general",
"causing",
"williams",
"secondround",
"helped",
"stock",
"returning",
"moving",
"delays"
]
},
{
"u": "https://whdh.com/news/trucks-roof-sheered-off-after-striking-overpass-on-storrow-drive-in-boston/",
"t": "Truck's roof sheered off after striking overpass on Storrow ...",
"nt": "Truck\u2019s roof sheered off after striking overpass on Storrow Drive in Boston",
"nd": "2021-08-04",
"nkw": [
"moving",
"truck",
"striking",
"drive",
"connection",
"overpass",
"work",
"way",
"boston",
"roof",
"trucks",
"whdh",
"television",
"sheered",
"storrow"
]
},
{
"u": "https://www.wcvb.com/article/truck-roof-sheared-almost-completely-off-after-striking-overpass-long-bostons-storrow-drive/37222735",
"t": "Truck roof sheared almost completely off in 'Storrowing'",
"nt": "Truck roof sheared almost completely off after striking overpass long Boston's Storrow Drive",
"nd": "2021-08-04",
"nkw": [
"west",
"truck",
"welcome",
"striking",
"drive",
"sheared",
"overpass",
"bostons",
"despite",
"used",
"roof",
"completely",
"long",
"storrow"
]
},
{
"u": "https://www.reddit.com/r/11foot8/",
"t": "r/11foot8 - Reddit",
"nt": "11Foot8",
"nkw": [
"11foot8"
]
},
{
"u": "https://www.reddit.com/r/boston/comments/oxxq7d/another_storrow_victim/",
"t": "Another Storrow Victim: boston - Reddit",
"nt": "Another Storrow Victim : boston",
"nkw": [
"victim",
"rboston",
"mark",
"press",
"shortcutssearch",
"keyboard",
"learn",
"j",
"boston",
"jump",
"rest",
"question",
"storrow"
]
},
{
"u": "https://twitter.com/pamelabump?lang=en",
"t": "Pamela Bump (@PamelaBump) | Twitter",
"nt": "",
"nkw": [
"help",
"disabled",
"supported",
"using",
"twittercom",
"javascript",
"browser",
"list",
"enable",
"switch"
]
},
{
"u": "https://www.yahoo.com/entertainment/moving-truck-storrowed-near-mass-212556728.html",
"t": "Moving Truck 'Storrowed' Near Mass General ... - Yahoo News",
"nt": "Moving Truck 'Storrowed' Near Mass General, Causing Traffic Delays",
"nkw": [
"truck",
"storrowed",
"traffic",
"technician",
"summer",
"spark",
"trade",
"near",
"taxes",
"chevrolet",
"tick",
"mass",
"general",
"causing",
"turned",
"moving",
"delays",
"street",
"corvette"
]
}
],
"2021-08-05": [
{
"u": "https://www.wcvb.com/article/oversized-truck-trailer-destroyed-after-striking-overpass-on-memorial-drive/37232211",
"t": "Oversized truck's trailer destroyed after striking overpass on ...",
"nt": "Oversized truck's trailer destroyed after striking overpass on Memorial Drive",
"nd": "2021-08-05",
"nkw": [
"white",
"memorial",
"truck",
"box",
"striking",
"injuries",
"oversized",
"trailer",
"cambridge",
"overpass",
"destroyed",
"trucks",
"clearance",
"traveling",
"drive"
]
},
{
"u": "https://fy-nl.facebook.com/wcvb5/videos/storrowed-truck-crashes-into-bridge-on-bostons-storrow-drive/2117177705251336/",
"t": "Truck crashes into bridge on Boston's Storrow Drive | Facebook",
"nt": "WCVB Channel 5 Boston - Storrowed! Truck crashes into bridge on Boston's Storrow Drive",
"nkw": [
"ive",
"truck",
"wcvb",
"saved",
"storrowed",
"dog",
"channel",
"bridge",
"life",
"military",
"service",
"boston",
"bostons",
"think",
"crashes",
"drive",
"veteran",
"storrow"
]
},
{
"u": "https://whdh.com/news/oversized-truck-destroyed-after-colliding-with-bridge-in-cambridge/",
"t": "Oversized truck destroyed after colliding with bridge ... - WHDH",
"nt": "Oversized truck destroyed after colliding with bridge in Cambridge",
"nd": "2021-08-05",
"nkw": [
"mass",
"colliding",
"truck",
"oversized",
"farm",
"cambridge",
"bridge",
"harvard",
"destroyed",
"oyster",
"scene",
"box",
"struck"
]
},
{
"u": "https://www.mvtimes.com/2021/08/05/marthas-vineyard-oyster-truck-shucked-bridge/",
"t": "Martha's Vineyard oyster truck shucked by bridge",
"nt": "Martha\u2019s Vineyard oyster truck shucked by bridge",
"nd": "2021-08-05",
"nkw": [
"signature",
"vineyard",
"marthas",
"shucked",
"truck",
"bridge",
"oysters",
"injured",
"driver",
"oyster",
"drive",
"crash",
"sullivan",
"storrow"
]
}
],
"2021-08-06": [
{
"u": "https://www.reddit.com/r/boston/comments/oytp8l/heavy_police_presence_bu_central_after_someone/",
"t": "Heavy police presence BU central after someone jumped off ...",
"nt": "Heavy police presence BU central after someone jumped off the bridge onto 90 Westbound. : boston",
"nkw": [
"rboston",
"mark",
"jumped",
"90",
"j",
"jump",
"heavy",
"westbound",
"press",
"learn",
"boston",
"central",
"shortcutssearch",
"bridge",
"keyboard",
"question",
"presence",
"rest",
"bu"
]
},
{
"u": "https://www.facebook.com/TheHarvardPress/posts/breaking-newsresident-killed-in-single-car-accident-on-still-river-roadby-john-o/10158280960236778/",
"t": "Breaking news Resident killed in... - The Harvard Press",
"nt": "Log into Facebook",
"nkw": [
"continue",
"noticeyou",
"log",
"facebook"
]
},
{
"u": "https://www.facebook.com/TheHarvardPress/posts/mccurdy-track-and-harvard-park-at-34-lancaster-county-road-was-the-site-of-a-flu/10157780958241778/",
"t": "McCurdy Track and Harvard Park at 34... - The Harvard Press",
"nt": "Log into Facebook",
"nkw": [
"continue",
"noticeyou",
"log",
"facebook"
]
},
{
"u": "https://www.bldup.com/projects/harrington-park-condo",
"t": "Harrington Park Condo Construction Projects - Lexington, MA",
"nt": "Harrington Park Condo Construction Projects - Lexington, MA",
"nkw": [
"architecture",
"construction",
"architects",
"ma",
"realty",
"associates",
"design",
"lexington",
"development",
"partners",
"harrington",
"park",
"projects",
"llc",
"condo",
"group"
]
},
{
"u": "https://blog.bestamericanpoetry.com/the_best_american_poetry/interviews/",
"t": "Interviews (285) - The Best American Poetry",
"nt": "The Best American Poetry",
"nkw": [
"american",
"poem",
"wrote",
"writing",
"ears",
"room",
"best",
"prose",
"book",
"poets",
"poems",
"poetry"
]
},
{
"u": "https://northendregionalreview.com/wp-content/uploads/2021/08/NER0819.pdf",
"t": "Candidate for Mayor of Boston - North End Regional Review",
"nt": "",
"nkw": []
},
{
"u": "https://www.newsbreak.com/news/2333388882804/fire-burns-on-site-of-pentucket-regional-high-school",
"t": "Fire Burns on Site of Pentucket Regional High School",
"nt": "Fire Burns on Site of Pentucket Regional High School",
"nkw": [
"scheduled",
"regional",
"burns",
"vandalized",
"site",
"stones",
"headstones",
"farmington",
"volunteers",
"high",
"trustees",
"school",
"went",
"crew",
"pentucket",
"cemetery"
]
},
{
"u": "https://www.businessyab.com/explore/united_states/rhode_island/providence_county/woonsocket/foundry_street/45/cousins-collision-center-401-766-7679.html",
"t": "Cousin's Collision Center, 45 Foundry St, Woonsocket, RI ...",
"nt": "Cousin's Collision Center, 45 Foundry St, Woonsocket, RI 02895, USA",
"nd": "2019-10-02",
"nkw": [
"collision",
"woonsocket",
"phone",
"ri",
"street",
"center",
"45",
"usa",
"repair",
"st",
"cousins",
"foundry",
"02895"
]
},
{
"u": "https://www.businessyab.com/explore/united_states/new_hampshire/hillsborough_county/pelham/bridge_street/503/salem-66-auto-sales-603-635-3222.html",
"t": "Salem 66 Auto Sales - 503 Bridge St, Pelham, NH...",
"nt": "Salem 66 Auto Sales, 503 Bridge St, Pelham, NH 03076, USA",
"nd": "2019-08-01",
"nkw": [
"66",
"sales",
"pelham",
"auto",
"503",
"phone",
"number",
"bridge",
"street",
"nh",
"usa",
"salem",
"st"
]
}
],
"2021-08-07": [
{
"u": "https://www.reddit.com/r/boston/comments/ozxj36/storrow_drive_west_closed_from_fenway_to_the_bu/",
"t": "Storrow Drive West closed from Fenway to the BU Bridge so ...",
"nt": "Storrow Drive West closed from Fenway to the BU Bridge so this guy can back up : boston",
"nkw": [
"rboston",
"mark",
"fenway",
"j",
"jump",
"storrow",
"press",
"learn",
"boston",
"west",
"shortcutssearch",
"bridge",
"keyboard",
"guy",
"drive",
"question",
"closed",
"rest",
"bu"
]
},
{
"u": "https://www.reddit.com/r/boston/comments/ozxks2/update_backwards_storrowed_while_reversing_under/",
"t": "Backwards Storrowed while reversing under the Silber way ...",
"nt": "Update \" Backwards Storrowed while reversing under the Silber way bridge. Ended up ripping off his antennas and navigation equipment. This dude's setting records today! : boston",
"nkw": [
"today",
"navigation",
"records",
"update",
"setting",
"storrowed",
"ripping",
"reversing",
"way",
"silber"
]
},
{
"u": "https://twitter.com/nicksair?lang=en",
"t": "Nicholas William Reed (@nicksair) | Twitter",
"nt": "",
"nkw": [
"help",
"disabled",
"supported",
"using",
"twittercom",
"javascript",
"browser",
"list",
"enable",
"switch"
]
},
{
"u": "https://www.universalhub.com/2021/man-boat-narrowly-avoids-fate-man-nantucket-and",
"t": "Florida Man narrowly avoids fate of Man from Nantucket and ...",
"nt": "Florida Man narrowly avoids fate of Man from Nantucket and Man from Martha's Vineyard",
"nkw": [
"vineyard",
"marthas",
"men",
"nantucket",
"successful",
"fate",
"weekooh",
"avoids",
"narrowly",
"florida",
"roads",
"shortly",
"river",
"subject",
"names",
"man",
"watercraft",
"somebody"
]
}
],
"2021-08-08": [
{
"u": "https://www.reddit.com/r/boston/comments/p0fwje/the_original_storrowing/",
"t": "r/boston - The Original Storrowing - Reddit",
"nt": "The Original Storrowing : boston",
"nkw": [
"original",
"rboston",
"mark",
"press",
"shortcutssearch",
"keyboard",
"learn",
"j",
"boston",
"jump",
"storrowing",
"rest",
"question"
]
}
],
"2021-08-09": [
{
"u": "https://www.reddit.com/r/onewheel/comments/p12s2w/some_people_are_the_worst_misuses_the_board/",
"t": "Some people are the worst. Misuses the board, ignores the ...",
"nt": "Some people are the worst. Misuses the board, ignores the safety warnings, falls then blames the onewheel. : onewheel",
"nkw": [
"possible",
"boardsport",
"board",
"allows",
"blames",
"warnings",
"safety",
"misuses",
"carve",
"thought",
"planet",
"onewheel",
"electric",
"ignores",
"worst",
"falls",
"earth",
"ways",
"revolutionary"
]
},
{
"u": "https://www.reddit.com/r/boston/comments/p0tno6/is_driving_in_boston_harder_on_purpose/",
"t": "Is driving in Boston harder on purpose? - Reddit",
"nt": "Is driving in Boston harder on purpose? : boston",
"nkw": [
"ive",
"harder",
"purpose",
"residents",
"questions",
"maps",
"wont",
"city",
"driving",
"california",
"wires",
"boston",
"im",
"wrong"
]
}
],
"2021-08-10": [
{
"u": "https://www.reddit.com/r/boston/comments/ozraxb/a_car_accident_in_boston_massachusetts_1927/",
"t": "A car accident in Boston, Massachusetts - 1927 - Reddit",
"nt": "A car accident in Boston, Massachusetts",
"nkw": [
"car",
"rboston",
"mark",
"press",
"shortcutssearch",
"accident",
"keyboard",
"learn",
"j",
"boston",
"jump",
"rest",
"question",
"massachusetts"
]
},
{
"u": "https://accordingtohoyt.com/2021/08/10/the-tale-of-the-flying-dutch-u-haul/",
"t": "The Tale of The Flying Dutch-U-Haul - According To Hoyt",
"nt": "The Tale of The Flying Dutch-U-Haul",
"nd": "2021-08-10",
"nkw": [
"young",
"tale",
"white",
"dutchuhaul",
"wood",
"flying",
"way",
"know",
"man",
"uhaul",
"furniture",
"good"
]
}
],
"2021-08-11": [
{
"u": "https://www.universalhub.com/2021/just-time-allston-christmas",
"t": "Just in time for Allston Christmas | Universal Hub",
"nt": "Just in time for Allston Christmas",
"nkw": [
"help",
"christmas",
"universal",
"hub",
"going",
"allston",
"contribution",
"consider",
"completely",
"nondeductible"
]
}
]
}

Loading…
Cancel
Save