Browse Source

Added article flagging functionality.

master
Trey Del Bonis 2 years ago
parent
commit
147d383cc7
4 changed files with 177 additions and 95 deletions
  1. 68
    6
      app.py
  2. 18
    11
      inventory.py
  3. 79
    73
      searchlib.py
  4. 12
    5
      templates/main.htm

+ 68
- 6
app.py View File



MAX_SEARCH_DAYS = 180 MAX_SEARCH_DAYS = 180
MAX_SHOW_DAYS = 20 MAX_SHOW_DAYS = 20
# Flags may only be filed against dates at most this many days in the past.
REPORT_HORIZON = 180
# Per-IP cap on distinct article flags for a single day.
MAX_USER_REPORTS_PER_DAY = 3


################################ ################################
# Core configuration # Core configuration


return tmplts.TemplateResponse('main.htm', p) return tmplts.TemplateResponse('main.htm', p)


@app.post('/action/flag')
async def handle_flag(req: Request, date: str = Form(...), article: str = Form(...)):
    """Record a false-positive flag for an article on a given day.

    Validates the submitted date, deduplicates repeat reports from the
    same address, enforces a per-IP daily report cap, notifies the report
    hook, and persists the updated flag list.
    """
    ipaddr = req.client.host

    try:
        today = datetime.now()
        pdate = datetime.strptime(date, inventory.DATE_FORMAT)
        # Reject dates in the future or older than the reporting horizon.
        if pdate > today or (today - pdate).days > REPORT_HORIZON:
            raise ValueError('bad date')
    except ValueError:
        # Narrowed from `except Exception`: covers both strptime parse
        # failures and the explicit range check above.
        return JSONResponse({'status': 'error'}, status_code=400)

    flags = await inventory.load_date_flags_async(pdate)

    # Make sure it's not a duplicate and limit the number of reports
    nreporter = 0
    for e in flags:
        if e['src'] == ipaddr:
            if e['url'] == article:
                # Same address already flagged this URL; treat as success.
                return {'status': 'OK'}
            nreporter += 1
            # BUGFIX: was `>=`, which rejected a user one report short of
            # MAX_USER_REPORTS_PER_DAY. Reject only when accepting this
            # new report would exceed the cap.
            if nreporter + 1 > MAX_USER_REPORTS_PER_DAY:
                print('user', ipaddr, 'looking sussy')
                await reporthook.send_report('address %s made more reports for %s than allowed' % (ipaddr, date))
                return JSONResponse({'status': 'error'}, status_code=429)

    await reporthook.send_report('address %s reported url %s' % (ipaddr, article))

    flags.append({
        'src': ipaddr,
        'url': article,
    })

    await inventory.save_date_flags_async(pdate, flags)

    return make_html_redirect_response('/')

################################ ################################
# API endpoints # API endpoints
################################ ################################
for i in range(MAX_SEARCH_DAYS): for i in range(MAX_SEARCH_DAYS):
that_day = today - i * day_dur that_day = today - i * day_dur
report = await inventory.load_date_report_async(that_day) report = await inventory.load_date_report_async(that_day)
flags = await inventory.load_date_flags_async(that_day)
if len(report) > 0: if len(report) > 0:
reports[that_day.strftime(inventory.DATE_FORMAT)] = report
reports[that_day.strftime(inventory.DATE_FORMAT)] = {
'articles': report,
'flags': flags,
}


return reports return reports


def convert_days_from_articles(rarts):
processed = searchlib.process_results(rarts)
def convert_days_from_articles(days):
output = [] output = []


for dstr, arts in processed.items():
for dstr, parts in days.items():
dr = searchlib.process_day_results(dstr, parts['articles'])
flags = {e['url'] for e in parts['flags']}

day = { day = {
'date': dstr, 'date': dstr,
'links': [convert_article(a) for a in arts['pass']],
'maybe_links': [convert_article(a) for a in arts['maybe']]
'links': [],
'maybe_links': []
} }


# Process hard passes.
for a in dr['pass']:
ca = convert_article(a)
if a['url'] not in flags:
day['links'].append(ca)
else:
day['maybe_links'].append(ca)

# Process weak articles.
for a in dr['maybe']:
ca = convert_article(a)
if a['url'] not in flags:
day['maybe_links'].append(ca)

if len(day['links']) > 0: if len(day['links']) > 0:
output.append(day) output.append(day)


lowest = ndays lowest = ndays


return lowest return lowest

def make_html_redirect_response(url):
    """Return an HTMLResponse that immediately meta-refreshes to *url*.

    NOTE(review): *url* is interpolated unescaped; callers currently pass
    only fixed internal paths like '/'.
    """
    markup = '<head><meta http-equiv="Refresh" content="0; URL=' + url + '"></head>'
    return HTMLResponse(markup)

+ 18
- 11
inventory.py View File

def get_datadir():
    """Return the data directory root, read from the SB_DATADIR env var."""
    return os.environ.get('SB_DATADIR')


def get_article_path(slug):
    """Return the JSON file path for the article with the given slug."""
    fname = slug + '.json'
    return os.path.join(get_datadir(), 'articles', fname)

def load_article_by_slug(slug):
    """Load and return the parsed JSON article identified by *slug*."""
    path = get_article_path(slug)
    with open(path, 'r') as f:
        return json.load(f)

def save_article(data):
    """Persist an article dict to its slug-derived JSON path.

    BUGFIX: previously opened the file in read mode ('r') and called the
    nonexistent json.save(); now opens for writing and uses json.dump().
    """
    with open(get_article_path(data['slug']), 'w') as f:
        json.dump(data, f)

def get_date_report_path(date):
    """Return the per-day report JSON path for *date*."""
    day_str = date.strftime(DATE_FORMAT)
    return os.path.join(get_datadir(), 'days', day_str + '.json')
os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True) os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
async with aiofiles.open(path, 'w') as f: async with aiofiles.open(path, 'w') as f:
await f.write(json.dumps(data)) await f.write(json.dumps(data))

def get_date_flags_path(date):
    """Return the per-day flags JSON path for *date*."""
    day_str = date.strftime(DATE_FORMAT)
    return os.path.join(get_datadir(), 'flags', day_str + '.json')

async def load_date_flags_async(date):
    """Read the flag list for *date*; a missing file means no flags yet."""
    path = get_date_flags_path(date)
    if not os.path.exists(path):
        return []

    async with aiofiles.open(path, 'r') as f:
        raw = await f.read()
    return json.loads(raw)

async def save_date_flags_async(date, data):
    """Serialize *data* as JSON to the date's flags path, creating parent dirs."""
    path = get_date_flags_path(date)
    parent = os.path.dirname(os.path.abspath(path))
    os.makedirs(parent, exist_ok=True)
    async with aiofiles.open(path, 'w') as f:
        await f.write(json.dumps(data))

+ 79
- 73
searchlib.py View File

def process_results(rtbl):
    """Process raw search results grouped by query date.

    Compiles the domain rewrite/exclusion patterns once, then delegates
    each day's entries to the per-day processor. Returns a dict keyed by
    query date.

    BUGFIX: called the misspelled `_process_day_result` (NameError at
    runtime); the helper is named `_process_day_results`.
    """
    rw_domains = [(re.compile(reg), rw) for reg, rw in REWRITE_DOMAINS]
    excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS]

    results = {}

    for qdate, rents in rtbl.items():
        results[qdate] = _process_day_results(rw_domains, excl_domains, qdate, rents)
    return results


# Rewrite the domain, if applicable
for reg, rw in rw_domains:
if reg.fullmatch(unl):
unl = rw
break
def process_day_results(query_date, rents):
    """Compile the configured domain patterns and process one day's entries."""
    compiled_rw = [(re.compile(pat), repl) for pat, repl in REWRITE_DOMAINS]
    compiled_excl = [re.compile(pat) for pat in EXCL_DOMAINS]
    return _process_day_results(compiled_rw, compiled_excl, query_date, rents)


# Check skip because of URL path file extension
skip_cuz_pathext = False
for ext in EXCL_PATH_EXTS:
if upath.endswith(ext):
skip_cuz_pathext = True
break
if skip_cuz_pathext:
continue
def _process_day_results(rw_domains, excl_domains, qdate, rents):
    """Filter and classify one day's raw search entries.

    rw_domains: list of (compiled regex, replacement) domain rewrites.
    excl_domains: list of compiled regexes of domains to drop entirely.
    qdate: fallback date for entries that carry no date of their own.
    Returns {'pass': [...], 'maybe': [...]} of article records.
    """
    articles = {
        'pass': [],
        'maybe': [],
    }

    for rent in rents:
        uscheme, unl, upath, upar, uq, ufrag = urllib.parse.urlparse(rent['u'])

        # Skip entries whose domain matches an exclusion pattern.
        if any(reg.fullmatch(unl) for reg in excl_domains):
            continue

        # Rewrite the domain, if applicable.
        for reg, rw in rw_domains:
            if reg.fullmatch(unl):
                unl = rw
                break

        # Skip entries whose URL path has an excluded file extension.
        if any(upath.endswith(ext) for ext in EXCL_PATH_EXTS):
            continue

        has_kws = 'nkw' in rent

        # Skip entries containing a hard exclusion keyword.
        skip_cuz_kws = False
        if has_kws:
            for kw in EXCL_KWS:
                for akw in rent['nkw']:
                    if kw in akw:
                        skip_cuz_kws = True
                if skip_cuz_kws:
                    break
        if skip_cuz_kws:
            continue

        # Now characterize what kind of entry it is.
        has_date = 'nd' in rent
        has_title = 'nt' in rent
        has_pass_kw = False
        if has_kws:
            for kw in PASS_KWS:
                for akw in rent['nkw']:
                    if kw in akw:
                        has_pass_kw = True
                if has_pass_kw:
                    break

        # Assemble a record to store the thing.
        eff_date = rent['nd'] if has_date else qdate
        eff_url = urllib.parse.urlunparse([uscheme, unl, upath, upar, uq, ''])
        item = {
            'slug': gen_slug(rent['t'], eff_date),
            'url': eff_url,
            'gtitle': rent['t'],
            'title': rent['nt'] if has_title else None,
            'date': eff_date,
            'kws': rent['nkw'] if has_kws else None,
        }

        if has_pass_kw:
            articles['pass'].append(item)
        else:
            articles['maybe'].append(item)

    return articles


SLUG_CHUNKS = re.compile('[abcdefghijklmnopqrstuvwxyz1234567890]+') SLUG_CHUNKS = re.compile('[abcdefghijklmnopqrstuvwxyz1234567890]+')



+ 12
- 5
templates/main.htm View File

{% extends 'base.htm' %} {% extends 'base.htm' %}


{# Render one article list item plus a form to flag it as a false positive. #}
{% macro render_article_link(desc, date) %}
<li>
<a href="{{ desc.url }}">{{ desc.title }}</a>
<form action="/action/flag" method="POST">
<input type="hidden" name="date" value="{{ date }}"/>
<input type="hidden" name="article" value="{{ desc.url }}"/>
<input type="submit" value="Flag False Positive"/>
</form>
</li>
{% endmacro %}


{% block content %} {% block content %}
false-positives (which you can report soon!). There's also a way to request false-positives (which you can report soon!). There's also a way to request
incident data directly from the MassDOT, but that's a lot more work to pull data
from and I'm not sure how quickly it's updated so I'll leave that to someone from and I'm not sure how quickly it's updated so I'll leave that to someone
more dedicated to work on.
more dedicated to <a href="https://code.tr3y.io/treyzania/storrowed.boston">work on</a>.
</p> </p>


<h1>Articles</h1> <h1>Articles</h1>
<h3>{{ d.date }}</h3> <h3>{{ d.date }}</h3>
<ul> <ul>
{% for l in d.links %} {% for l in d.links %}
{{ render_article_link(l) }}
{{ render_article_link(l, d.date) }}
{% endfor %} {% endfor %}
</ul> </ul>
{% if d.maybe_links|length > 0 %} {% if d.maybe_links|length > 0 %}
<summary><em>and {{ d.maybe_links|length }} weak match(es)</em></summary> <summary><em>and {{ d.maybe_links|length }} weak match(es)</em></summary>
<ul> <ul>
{% for l in d.maybe_links %} {% for l in d.maybe_links %}
{{ render_article_link(l) }}
{{ render_article_link(l, d.date) }}
{% endfor %} {% endfor %}
</ul> </ul>
</details> </details>

Loading…
Cancel
Save