Added article flagging functionality.

Branch: master
Trey Del Bonis, 2 years ago
commit 147d383cc7
4 changed files with 177 additions and 95 deletions:

  app.py              +68  −6
  inventory.py        +18  −11
  searchlib.py        +79  −73
  templates/main.htm  +12  −5

app.py  (+68 −6)

@@ -18,6 +18,8 @@ import searchlib
 
 MAX_SEARCH_DAYS = 180
 MAX_SHOW_DAYS = 20
+REPORT_HORIZON = 180
+MAX_USER_REPORTS_PER_DAY = 3
 
 ################################
 # Core configuration
@@ -77,6 +79,43 @@ async def render_main(req: Request):
 
     return tmplts.TemplateResponse('main.htm', p)
 
+@app.post('/action/flag')
+async def handle_flag(req: Request, date: str = Form(...), article: str = Form(...)):
+    ipaddr = req.client.host
+
+    try:
+        today = datetime.now()
+        pdate = datetime.strptime(date, inventory.DATE_FORMAT)
+        if pdate > today or (today - pdate).days > REPORT_HORIZON:
+            raise ValueError('bad date')
+    except Exception as e:
+        return JSONResponse({'status': 'error'}, status_code=400)
+
+    flags = await inventory.load_date_flags_async(pdate)
+
+    # Make sure it's not a duplicate and limit the number of reports
+    nreporter = 0
+    for e in flags:
+        if e['src'] == ipaddr:
+            if e['url'] == article:
+                return {'status': 'OK'}
+            nreporter += 1
+            if nreporter + 1 >= MAX_USER_REPORTS_PER_DAY:
+                print('user', ipaddr, 'looking sussy')
+                await reporthook.send_report('address %s made more reports for %s than allowed' % (ipaddr, date))
+                return JSONResponse({'status': 'error'}, status_code=429)
+
+    await reporthook.send_report('address %s reported url %s' % (ipaddr, article))
+
+    flags.append({
+        'src': ipaddr,
+        'url': article,
+    })
+
+    await inventory.save_date_flags_async(pdate, flags)
+
+    return make_html_redirect_response('/')
+
 ################################
 # API endpoints
 ################################
@@ -134,22 +173,42 @@ async def load_recent_articles():
     for i in range(MAX_SEARCH_DAYS):
         that_day = today - i * day_dur
         report = await inventory.load_date_report_async(that_day)
+        flags = await inventory.load_date_flags_async(that_day)
         if len(report) > 0:
-            reports[that_day.strftime(inventory.DATE_FORMAT)] = report
+            reports[that_day.strftime(inventory.DATE_FORMAT)] = {
+                'articles': report,
+                'flags': flags,
+            }
 
     return reports
 
-def convert_days_from_articles(rarts):
-    processed = searchlib.process_results(rarts)
+def convert_days_from_articles(days):
     output = []
 
-    for dstr, arts in processed.items():
+    for dstr, parts in days.items():
+        dr = searchlib.process_day_results(dstr, parts['articles'])
+        flags = {e['url'] for e in parts['flags']}
+
         day = {
             'date': dstr,
-            'links': [convert_article(a) for a in arts['pass']],
-            'maybe_links': [convert_article(a) for a in arts['maybe']]
+            'links': [],
+            'maybe_links': []
         }
 
+        # Process hard passes.
+        for a in dr['pass']:
+            ca = convert_article(a)
+            if a['url'] not in flags:
+                day['links'].append(ca)
+            else:
+                day['maybe_links'].append(ca)
+
+        # Process weak articles.
+        for a in dr['maybe']:
+            ca = convert_article(a)
+            if a['url'] not in flags:
+                day['maybe_links'].append(ca)
+
         if len(day['links']) > 0:
            output.append(day)
@@ -177,3 +236,6 @@ def calc_num_days(dayslist):
             lowest = ndays
 
     return lowest
+
+def make_html_redirect_response(url):
+    return HTMLResponse('<head><meta http-equiv="Refresh" content="0; URL=' + url + '"></head>')
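
For reference, a minimal sketch of exercising the new endpoint from Python, assuming the app is served at http://localhost:8000 (the host, port, and sample form values are placeholders, not part of this commit):

# Hypothetical client for the new /action/flag endpoint.
import requests

resp = requests.post(
    'http://localhost:8000/action/flag',
    data={
        'date': '2023-01-05',                   # must parse with inventory.DATE_FORMAT
                                                # and fall within REPORT_HORIZON days
        'article': 'https://example.com/news',  # URL as shown in the article list
    },
)

# A malformed or out-of-range date returns HTTP 400; more than
# MAX_USER_REPORTS_PER_DAY reports from one address returns HTTP 429;
# otherwise the handler records the flag and redirects to '/'.
print(resp.status_code)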

inventory.py  (+18 −11)

@@ -8,17 +8,6 @@ DATE_FORMAT = "%Y-%m-%d"
 def get_datadir():
     return os.getenv('SB_DATADIR')
 
-def get_article_path(slug):
-    return os.path.join(get_datadir(), 'articles', slug + '.json')
-
-def load_article_by_slug(slug):
-    with open(get_article_path(slug), 'r') as f:
-        return json.load(f)
-
-def save_article(data):
-    with open(get_article_path(data['slug']), 'r') as f:
-        return json.save(data, f)
-
 def get_date_report_path(date):
     dstr = date.strftime(DATE_FORMAT)
     return os.path.join(get_datadir(), 'days', dstr + '.json')
@@ -50,3 +39,21 @@ async def save_date_report_async(date, data):
     os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
     async with aiofiles.open(path, 'w') as f:
         await f.write(json.dumps(data))
+
+def get_date_flags_path(date):
+    dstr = date.strftime(DATE_FORMAT)
+    return os.path.join(get_datadir(), 'flags', dstr + '.json')
+
+async def load_date_flags_async(date):
+    path = get_date_flags_path(date)
+    if not os.path.exists(path):
+        return []
+
+    async with aiofiles.open(path, 'r') as f:
+        return json.loads(await f.read())
+
+async def save_date_flags_async(date, data):
+    path = get_date_flags_path(date)
+    os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
+    async with aiofiles.open(path, 'w') as f:
+        await f.write(json.dumps(data))
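
Flags are stored per day as a JSON list of {'src', 'url'} records alongside the existing day reports. A quick sketch of the round-trip these helpers provide, assuming SB_DATADIR points at a writable directory (the path and sample values here are invented):

import asyncio
import os
from datetime import datetime

import inventory

async def demo():
    os.environ['SB_DATADIR'] = '/tmp/sb-data'  # placeholder data dir
    day = datetime(2023, 1, 5)

    flags = await inventory.load_date_flags_async(day)  # [] if no file yet
    flags.append({'src': '203.0.113.7', 'url': 'https://example.com/news'})
    await inventory.save_date_flags_async(day, flags)
    # -> writes /tmp/sb-data/flags/2023-01-05.json

asyncio.run(demo())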

searchlib.py  (+79 −73)

@@ -51,87 +51,93 @@ PASS_KWS = [
 def process_results(rtbl):
     rw_domains = [(re.compile(reg), rw) for reg, rw in REWRITE_DOMAINS]
     excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS]
 
     results = {}
 
     for qdate, rents in rtbl.items():
-        for rent in rents:
-            uscheme, unl, upath, upar, uq, ufrag = urllib.parse.urlparse(rent['u'])
-
-            # Check skip because of domain exclusion
-            skip_cuz_url = False
-            for reg in excl_domains:
-                if reg.fullmatch(unl):
-                    skip_cuz_url = True
-                    break
-            if skip_cuz_url:
-                continue
-
-            # Rewrite the domain, if applicable
-            for reg, rw in rw_domains:
-                if reg.fullmatch(unl):
-                    unl = rw
-                    break
-
-            # Check skip because of URL path file extension
-            skip_cuz_pathext = False
-            for ext in EXCL_PATH_EXTS:
-                if upath.endswith(ext):
-                    skip_cuz_pathext = True
-                    break
-            if skip_cuz_pathext:
-                continue
-
-            has_kws = 'nkw' in rent
-
-            # Check skip because of hard exclusion keywords
-            skip_cuz_kws = False
-            if has_kws:
-                for kw in EXCL_KWS:
-                    for akw in rent['nkw']:
-                        if kw in akw:
-                            skip_cuz_kws = True
-                    if skip_cuz_kws:
-                        break
-            if skip_cuz_kws:
-                continue
-
-            # Now characterize what kind of entry it is.
-            has_date = 'nd' in rent
-            has_title = 'nt' in rent
-            has_pass_kw = False
-            if has_kws:
-                for kw in PASS_KWS:
-                    for akw in rent['nkw']:
-                        if kw in akw:
-                            has_pass_kw = True
-                    if has_pass_kw:
-                        break
-
-            # Try to assemble a record to store the thing.
-            eff_date = rent['nd'] if has_date else qdate
-            eff_url = urllib.parse.urlunparse([uscheme, unl, upath, upar, uq, ''])
-            item = {
-                'slug': gen_slug(rent['t'], eff_date),
-                'url': eff_url,
-                'gtitle': rent['t'],
-                'title': rent['nt'] if has_title else None,
-                'date': eff_date,
-                'kws': rent['nkw'] if has_kws else None
-            }
-
-            if eff_date not in results:
-                results[eff_date] = {
-                    'pass': [],
-                    'maybe': []
-                }
-
-            if has_pass_kw:
-                results[eff_date]['pass'].append(item)
-            else:
-                results[eff_date]['maybe'].append(item)
-
+        results[qdate] = _process_day_results(rw_domains, excl_domains, qdate, rents)
     return results
+
+def process_day_results(query_date, rents):
+    rw_domains = [(re.compile(reg), rw) for reg, rw in REWRITE_DOMAINS]
+    excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS]
+    return _process_day_results(rw_domains, excl_domains, query_date, rents)
+
+def _process_day_results(rw_domains, excl_domains, qdate, rents):
+    articles = {
+        'pass': [],
+        'maybe': [],
+    }
+
+    for rent in rents:
+        uscheme, unl, upath, upar, uq, ufrag = urllib.parse.urlparse(rent['u'])
+
+        # Check skip because of domain exclusion
+        skip_cuz_url = False
+        for reg in excl_domains:
+            if reg.fullmatch(unl):
+                skip_cuz_url = True
+                break
+        if skip_cuz_url:
+            continue
+
+        # Rewrite the domain, if applicable
+        for reg, rw in rw_domains:
+            if reg.fullmatch(unl):
+                unl = rw
+                break
+
+        # Check skip because of URL path file extension
+        skip_cuz_pathext = False
+        for ext in EXCL_PATH_EXTS:
+            if upath.endswith(ext):
+                skip_cuz_pathext = True
+                break
+        if skip_cuz_pathext:
+            continue
+
+        has_kws = 'nkw' in rent
+
+        # Check skip because of hard exclusion keywords
+        skip_cuz_kws = False
+        if has_kws:
+            for kw in EXCL_KWS:
+                for akw in rent['nkw']:
+                    if kw in akw:
+                        skip_cuz_kws = True
+        if skip_cuz_kws:
+            continue
+
+        # Now characterize what kind of entry it is.
+        has_date = 'nd' in rent
+        has_title = 'nt' in rent
+        has_pass_kw = False
+        if has_kws:
+            for kw in PASS_KWS:
+                for akw in rent['nkw']:
+                    if kw in akw:
+                        has_pass_kw = True
+                if has_pass_kw:
+                    break
+
+        # Try to assemble a record to store the thing.
+        eff_date = rent['nd'] if has_date else qdate
+        eff_url = urllib.parse.urlunparse([uscheme, unl, upath, upar, uq, ''])
+        item = {
+            'slug': gen_slug(rent['t'], eff_date),
+            'url': eff_url,
+            'gtitle': rent['t'],
+            'title': rent['nt'] if has_title else None,
+            'date': eff_date,
+            'kws': rent['nkw'] if has_kws else None
+        }
+
+        if has_pass_kw:
+            articles['pass'].append(item)
+        else:
+            articles['maybe'].append(item)
+
+    return articles
 
 SLUG_CHUNKS = re.compile('[abcdefghijklmnopqrstuvwxyz1234567890]+')
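
With the refactor, per-day classification is callable on its own, which is what app.py now uses. A rough sketch of a call with an invented result entry ('u' = URL, 't' = title, 'nkw' = normalized keywords; whether it lands in 'pass' or 'maybe' depends on the module's PASS_KWS/EXCL_KWS/EXCL_DOMAINS lists, which aren't shown in this diff):

import searchlib

# One raw search result in the shape the code reads; values invented.
rent = {
    'u': 'https://example.com/2023/01/05/truck-stuck',
    't': 'Truck stuck under bridge',
    'nkw': ['storrow drive', 'truck'],
}

day = searchlib.process_day_results('2023-01-05', [rent])
# Entries matching a PASS_KWS keyword land in 'pass'; anything else
# that survives the domain/extension/keyword filters lands in 'maybe'.
print(len(day['pass']), len(day['maybe']))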


templates/main.htm  (+12 −5)

@@ -1,7 +1,14 @@
 {% extends 'base.htm' %}
 
-{% macro render_article_link(desc) %}
-<li><a href="{{ desc.url }}">{{ desc.title }}</a></li>
+{% macro render_article_link(desc, date) %}
+<li>
+  <a href="{{ desc.url }}">{{ desc.title }}</a>
+  <form action="/action/flag" method="POST">
+    <input type="hidden" name="date" value="{{ date }}"/>
+    <input type="hidden" name="article" value="{{ desc.url }}"/>
+    <input type="submit" value="Flag False Positive"/>
+  </form>
+</li>
 {% endmacro %}
 
 {% block content %}
@@ -39,7 +46,7 @@
     false-positives (which you can report soon!). There's also a way to request
     incident data directly the MassDOT, but that's a lot more work to pull data
     from and I'm not sure how quickly it's updated so I'll leave that to someone
-    more dedicated to work on.
+    more dedicated to <a href="https://code.tr3y.io/treyzania/storrowed.boston">work on</a>.
     </p>
 
     <h1>Articles</h1>
@@ -58,7 +65,7 @@
     <h3>{{ d.date }}</h3>
     <ul>
     {% for l in d.links %}
-        {{ render_article_link(l) }}
+        {{ render_article_link(l, d.date) }}
     {% endfor %}
     </ul>
     {% if d.maybe_links|length > 0 %}
@@ -66,7 +73,7 @@
     <summary><em>and {{ d.maybe_links|length }} weak match(es)</em></summary>
     <ul>
     {% for l in d.maybe_links %}
-        {{ render_article_link(l) }}
+        {{ render_article_link(l, d.date) }}
     {% endfor %}
     </ul>
     </details>
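
To see what the updated macro emits, it can be rendered standalone with jinja2; the hidden 'date' and 'article' fields are exactly what the new handle_flag endpoint reads (the article values below are made up):

from jinja2 import Template

TMPL = '''
{% macro render_article_link(desc, date) %}
<li>
  <a href="{{ desc.url }}">{{ desc.title }}</a>
  <form action="/action/flag" method="POST">
    <input type="hidden" name="date" value="{{ date }}"/>
    <input type="hidden" name="article" value="{{ desc.url }}"/>
    <input type="submit" value="Flag False Positive"/>
  </form>
</li>
{% endmacro %}
{{ render_article_link(desc, date) }}
'''

print(Template(TMPL).render(
    desc={'url': 'https://example.com/news', 'title': 'Truck stuck under bridge'},
    date='2023-01-05',
))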
