@@ -18,6 +18,8 @@ import searchlib
MAX_SEARCH_DAYS = 180
MAX_SHOW_DAYS = 20
REPORT_HORIZON = 180
MAX_USER_REPORTS_PER_DAY = 3

################################
# Core configuration

@@ -77,6 +79,43 @@ async def render_main(req: Request):
    return tmplts.TemplateResponse('main.htm', p)

@app.post('/action/flag')
async def handle_flag(req: Request, date: str = Form(...), article: str = Form(...)):
    ipaddr = req.client.host

    # Reject dates in the future or older than the report horizon.
    try:
        today = datetime.now()
        pdate = datetime.strptime(date, inventory.DATE_FORMAT)
        if pdate > today or (today - pdate).days > REPORT_HORIZON:
            raise ValueError('bad date')
    except ValueError:
        return JSONResponse({'status': 'error'}, status_code=400)

    flags = await inventory.load_date_flags_async(pdate)

    # Make sure it's not a duplicate, and limit the number of reports a
    # single address can make per day.
    nreporter = 0
    for e in flags:
        if e['src'] == ipaddr:
            if e['url'] == article:
                # Already flagged by this address, treat it as a success.
                return {'status': 'OK'}
            nreporter += 1
    if nreporter >= MAX_USER_REPORTS_PER_DAY:
        print('user', ipaddr, 'looking sussy')
        await reporthook.send_report('address %s made more reports for %s than allowed' % (ipaddr, date))
        return JSONResponse({'status': 'error'}, status_code=429)

    await reporthook.send_report('address %s reported url %s' % (ipaddr, article))
    flags.append({
        'src': ipaddr,
        'url': article,
    })
    await inventory.save_date_flags_async(pdate, flags)
    return make_html_redirect_response('/')
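
# A quick sketch of exercising the endpoint by hand, assuming the app is
# served locally on port 8000 (host, port, and the example values are
# assumptions, not part of this code):
#
#   curl -X POST http://localhost:8000/action/flag \
#        -d 'date=2024-03-05' -d 'article=https://example.com/storrowing'
#
# A well-formed flag redirects back to '/'; an out-of-range date returns a
# 400, and exceeding MAX_USER_REPORTS_PER_DAY returns a 429.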

################################
# API endpoints
################################

@@ -134,22 +173,42 @@ async def load_recent_articles():
    for i in range(MAX_SEARCH_DAYS):
        that_day = today - i * day_dur
        report = await inventory.load_date_report_async(that_day)
        flags = await inventory.load_date_flags_async(that_day)
        if len(report) > 0:
            reports[that_day.strftime(inventory.DATE_FORMAT)] = {
                'articles': report,
                'flags': flags,
            }
    return reports
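
# Shape of the mapping returned above (dates and values illustrative; flag
# entries carry the reporter address and the flagged URL):
#
#   {'2024-03-05': {'articles': [...],
#                   'flags': [{'src': '203.0.113.7', 'url': 'https://...'}]}}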

def convert_days_from_articles(days):
    output = []
    for dstr, parts in days.items():
        dr = searchlib.process_day_results(dstr, parts['articles'])
        flags = {e['url'] for e in parts['flags']}
        day = {
            'date': dstr,
            'links': [],
            'maybe_links': []
        }

        # Process hard passes.  Flagged articles get demoted to weak matches.
        for a in dr['pass']:
            ca = convert_article(a)
            if a['url'] not in flags:
                day['links'].append(ca)
            else:
                day['maybe_links'].append(ca)

        # Process weak articles.  Flagged ones are dropped entirely.
        for a in dr['maybe']:
            ca = convert_article(a)
            if a['url'] not in flags:
                day['maybe_links'].append(ca)

        if len(day['links']) > 0:
            output.append(day)

@@ -177,3 +236,6 @@ def calc_num_days(dayslist):
            lowest = ndays
    return lowest

def make_html_redirect_response(url):
    return HTMLResponse('<head><meta http-equiv="Refresh" content="0; URL=' + url + '"></head>')
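
# Browsers follow a zero-delay meta refresh immediately, so the plain HTML
# flag form can bounce back to the index without any JavaScript.  The URL is
# interpolated verbatim, so this helper is only meant for trusted,
# server-chosen paths like '/'.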
@@ -8,17 +8,6 @@ DATE_FORMAT = "%Y-%m-%d"

def get_datadir():
    return os.getenv('SB_DATADIR')

def get_article_path(slug):
    return os.path.join(get_datadir(), 'articles', slug + '.json')

def load_article_by_slug(slug):
    with open(get_article_path(slug), 'r') as f:
        return json.load(f)

def save_article(data):
    with open(get_article_path(data['slug']), 'w') as f:
        json.dump(data, f)

def get_date_report_path(date):
    dstr = date.strftime(DATE_FORMAT)
    return os.path.join(get_datadir(), 'days', dstr + '.json')

@@ -50,3 +39,21 @@ async def save_date_report_async(date, data):
    os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
    async with aiofiles.open(path, 'w') as f:
        await f.write(json.dumps(data))

def get_date_flags_path(date):
    dstr = date.strftime(DATE_FORMAT)
    return os.path.join(get_datadir(), 'flags', dstr + '.json')

async def load_date_flags_async(date):
    path = get_date_flags_path(date)
    if not os.path.exists(path):
        return []
    async with aiofiles.open(path, 'r') as f:
        return json.loads(await f.read())

async def save_date_flags_async(date, data):
    path = get_date_flags_path(date)
    os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
    async with aiofiles.open(path, 'w') as f:
        await f.write(json.dumps(data))
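
# A minimal round-trip sketch of the flag storage, assuming SB_DATADIR points
# at a writable directory (the date and flag entry below are illustrative):
#
#   import asyncio
#   from datetime import datetime
#
#   async def demo():
#       day = datetime(2024, 3, 5)
#       flags = await load_date_flags_async(day)  # [] if nothing stored yet
#       flags.append({'src': '203.0.113.7', 'url': 'https://example.com/a'})
#       await save_date_flags_async(day, flags)
#
#   asyncio.run(demo())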
@@ -51,87 +51,93 @@ PASS_KWS = [

def process_results(rtbl):
    rw_domains = [(re.compile(reg), rw) for reg, rw in REWRITE_DOMAINS]
    excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS]
    results = {}
    for qdate, rents in rtbl.items():
        results[qdate] = _process_day_results(rw_domains, excl_domains, qdate, rents)
    return results

def process_day_results(query_date, rents):
    rw_domains = [(re.compile(reg), rw) for reg, rw in REWRITE_DOMAINS]
    excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS]
    return _process_day_results(rw_domains, excl_domains, query_date, rents)
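
# This is the entry point the web layer calls once per day of results, e.g.
# (raw_entries is a placeholder for the raw search records):
#
#   dr = process_day_results('2024-03-05', raw_entries)
#   dr['pass'], dr['maybe']  # strong and weak matches for that day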

def _process_day_results(rw_domains, excl_domains, qdate, rents):
    articles = {
        'pass': [],
        'maybe': [],
    }
    for rent in rents:
        uscheme, unl, upath, upar, uq, ufrag = urllib.parse.urlparse(rent['u'])

        # Check skip because of domain exclusion
        skip_cuz_url = False
        for reg in excl_domains:
            if reg.fullmatch(unl):
                skip_cuz_url = True
                break
        if skip_cuz_url:
            continue

        # Rewrite the domain, if applicable
        for reg, rw in rw_domains:
            if reg.fullmatch(unl):
                unl = rw
                break

        # Check skip because of URL path file extension
        skip_cuz_pathext = False
        for ext in EXCL_PATH_EXTS:
            if upath.endswith(ext):
                skip_cuz_pathext = True
                break
        if skip_cuz_pathext:
            continue

        has_kws = 'nkw' in rent

        # Check skip because of hard exclusion keywords
        skip_cuz_kws = False
        if has_kws:
            for kw in EXCL_KWS:
                for akw in rent['nkw']:
                    if kw in akw:
                        skip_cuz_kws = True
                if skip_cuz_kws:
                    break
        if skip_cuz_kws:
            continue

        # Now characterize what kind of entry it is.
        has_date = 'nd' in rent
        has_title = 'nt' in rent
        has_pass_kw = False
        if has_kws:
            for kw in PASS_KWS:
                for akw in rent['nkw']:
                    if kw in akw:
                        has_pass_kw = True
                if has_pass_kw:
                    break

        # Try to assemble a record to store the thing.
        eff_date = rent['nd'] if has_date else qdate
        eff_url = urllib.parse.urlunparse([uscheme, unl, upath, upar, uq, ''])
        item = {
            'slug': gen_slug(rent['t'], eff_date),
            'url': eff_url,
            'gtitle': rent['t'],
            'title': rent['nt'] if has_title else None,
            'date': eff_date,
            'kws': rent['nkw'] if has_kws else None
        }
        if has_pass_kw:
            articles['pass'].append(item)
        else:
            articles['maybe'].append(item)
    return articles
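
# Rough shape of a raw entry, inferred from the accessors above ('u' = raw
# URL, 't' = source title; 'nd'/'nt'/'nkw' = normalized date/title/keywords
# when present; the field meanings are inferred and the values illustrative):
#
#   rent = {'u': 'https://example.com/news/storrowed', 't': 'Truck storrowed',
#           'nd': '2024-03-05', 'nkw': ['truck', 'storrowed']}
#
# An entry lands in articles['pass'] only if one of PASS_KWS matches a
# keyword; anything that survives the skip checks but lacks a pass keyword
# goes to articles['maybe'].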

SLUG_CHUNKS = re.compile('[abcdefghijklmnopqrstuvwxyz1234567890]+')
@@ -1,7 +1,14 @@
{% extends 'base.htm' %}

{% macro render_article_link(desc, date) %}
<li>
    <a href="{{ desc.url }}">{{ desc.title }}</a>
    <form action="/action/flag" method="POST">
        <input type="hidden" name="date" value="{{ date }}"/>
        <input type="hidden" name="article" value="{{ desc.url }}"/>
        <input type="submit" value="Flag False Positive"/>
    </form>
</li>
{% endmacro %}
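
{# Every listed article gets its own tiny POST form; the date is threaded
   through so the server can look up the right per-day flag file.  This keeps
   flagging working without any JavaScript. #}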

{% block content %}

@@ -39,7 +46,7 @@
false-positives (which you can report soon!). There's also a way to request
incident data directly from MassDOT, but that's a lot more work to pull data
from and I'm not sure how quickly it's updated, so I'll leave that to someone
more dedicated to <a href="https://code.tr3y.io/treyzania/storrowed.boston">work on</a>.
</p>

<h1>Articles</h1>

@@ -58,7 +65,7 @@
<h3>{{ d.date }}</h3>
<ul>
{% for l in d.links %}
    {{ render_article_link(l, d.date) }}
{% endfor %}
</ul>
{% if d.maybe_links|length > 0 %}

@@ -66,7 +73,7 @@
<summary><em>and {{ d.maybe_links|length }} weak match(es)</em></summary>
<ul>
{% for l in d.maybe_links %}
    {{ render_article_link(l, d.date) }}
{% endfor %}
</ul>
</details>