Added article flagging functionality.

Branch: master
Trey Del Bonis, 2 years ago
commit 147d383cc7
4 changed files with 177 additions and 95 deletions:

  app.py              +68  −6
  inventory.py        +18  −11
  searchlib.py        +79  −73
  templates/main.htm  +12  −5

app.py  (+68 −6)

@@ -18,6 +18,8 @@ import searchlib
 
 MAX_SEARCH_DAYS = 180
 MAX_SHOW_DAYS = 20
+REPORT_HORIZON = 180
+MAX_USER_REPORTS_PER_DAY = 3
 
 ################################
 # Core configuration
@@ -77,6 +79,43 @@ async def render_main(req: Request):
 
     return tmplts.TemplateResponse('main.htm', p)
 
+@app.post('/action/flag')
+async def handle_flag(req: Request, date: str = Form(...), article: str = Form(...)):
+    ipaddr = req.client.host
+
+    try:
+        today = datetime.now()
+        pdate = datetime.strptime(date, inventory.DATE_FORMAT)
+        if pdate > today or (today - pdate).days > REPORT_HORIZON:
+            raise ValueError('bad date')
+    except Exception as e:
+        return JSONResponse({'status': 'error'}, status_code=400)
+
+    flags = await inventory.load_date_flags_async(pdate)
+
+    # Make sure it's not a duplicate and limit the number of reports
+    nreporter = 0
+    for e in flags:
+        if e['src'] == ipaddr:
+            if e['url'] == article:
+                return {'status': 'OK'}
+            nreporter += 1
+            if nreporter + 1 >= MAX_USER_REPORTS_PER_DAY:
+                print('user', ipaddr, 'looking sussy')
+                await reporthook.send_report('address %s made more reports for %s than allowed' % (ipaddr, date))
+                return JSONResponse({'status': 'error'}, status_code=429)
+
+    await reporthook.send_report('address %s reported url %s' % (ipaddr, article))
+
+    flags.append({
+        'src': ipaddr,
+        'url': article,
+    })
+
+    await inventory.save_date_flags_async(pdate, flags)
+
+    return make_html_redirect_response('/')
+
 ################################
 # API endpoints
 ################################
@@ -134,22 +173,42 @@ async def load_recent_articles():
     for i in range(MAX_SEARCH_DAYS):
         that_day = today - i * day_dur
         report = await inventory.load_date_report_async(that_day)
+        flags = await inventory.load_date_flags_async(that_day)
         if len(report) > 0:
-            reports[that_day.strftime(inventory.DATE_FORMAT)] = report
+            reports[that_day.strftime(inventory.DATE_FORMAT)] = {
+                'articles': report,
+                'flags': flags,
+            }
 
     return reports
 
-def convert_days_from_articles(rarts):
-    processed = searchlib.process_results(rarts)
+def convert_days_from_articles(days):
     output = []
 
-    for dstr, arts in processed.items():
+    for dstr, parts in days.items():
+        dr = searchlib.process_day_results(dstr, parts['articles'])
+        flags = {e['url'] for e in parts['flags']}
+
         day = {
             'date': dstr,
-            'links': [convert_article(a) for a in arts['pass']],
-            'maybe_links': [convert_article(a) for a in arts['maybe']]
+            'links': [],
+            'maybe_links': []
         }
 
+        # Process hard passes.
+        for a in dr['pass']:
+            ca = convert_article(a)
+            if a['url'] not in flags:
+                day['links'].append(ca)
+            else:
+                day['maybe_links'].append(ca)
+
+        # Process weak articles.
+        for a in dr['maybe']:
+            ca = convert_article(a)
+            if a['url'] not in flags:
+                day['maybe_links'].append(ca)
+
         if len(day['links']) > 0:
            output.append(day)
@@ -177,3 +236,6 @@ def calc_num_days(dayslist):
             lowest = ndays
 
     return lowest
+
+def make_html_redirect_response(url):
+    return HTMLResponse('<head><meta http-equiv="Refresh" content="0; URL=' + url + '"></head>')
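
For reference, a minimal sketch of exercising the new endpoint from Python, assuming the app is served at http://localhost:8000 (the host, port, and sample form values are placeholders, not part of this commit):

# Hypothetical client for the new /action/flag endpoint.
import requests

resp = requests.post(
    'http://localhost:8000/action/flag',
    data={
        'date': '2023-01-05',                   # must parse with inventory.DATE_FORMAT
                                                # and fall within REPORT_HORIZON days
        'article': 'https://example.com/news',  # URL as shown in the article list
    },
)

# A malformed or out-of-range date returns HTTP 400; more than
# MAX_USER_REPORTS_PER_DAY reports from one address returns HTTP 429;
# otherwise the handler records the flag and redirects to '/'.
print(resp.status_code)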

inventory.py  (+18 −11)

@@ -8,17 +8,6 @@ DATE_FORMAT = "%Y-%m-%d"
 def get_datadir():
     return os.getenv('SB_DATADIR')
 
-def get_article_path(slug):
-    return os.path.join(get_datadir(), 'articles', slug + '.json')
-
-def load_article_by_slug(slug):
-    with open(get_article_path(slug), 'r') as f:
-        return json.load(f)
-
-def save_article(data):
-    with open(get_article_path(data['slug']), 'r') as f:
-        return json.save(data, f)
-
 def get_date_report_path(date):
     dstr = date.strftime(DATE_FORMAT)
     return os.path.join(get_datadir(), 'days', dstr + '.json')
@@ -50,3 +39,21 @@ async def save_date_report_async(date, data):
     os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
     async with aiofiles.open(path, 'w') as f:
         await f.write(json.dumps(data))
+
+def get_date_flags_path(date):
+    dstr = date.strftime(DATE_FORMAT)
+    return os.path.join(get_datadir(), 'flags', dstr + '.json')
+
+async def load_date_flags_async(date):
+    path = get_date_flags_path(date)
+    if not os.path.exists(path):
+        return []
+
+    async with aiofiles.open(path, 'r') as f:
+        return json.loads(await f.read())
+
+async def save_date_flags_async(date, data):
+    path = get_date_flags_path(date)
+    os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
+    async with aiofiles.open(path, 'w') as f:
+        await f.write(json.dumps(data))
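
Flags are stored per day as a JSON list of {'src', 'url'} records alongside the existing day reports. A quick sketch of the round-trip these helpers provide, assuming SB_DATADIR points at a writable directory (the path and sample values here are invented):

import asyncio
import os
from datetime import datetime

import inventory

async def demo():
    os.environ['SB_DATADIR'] = '/tmp/sb-data'  # placeholder data dir
    day = datetime(2023, 1, 5)

    flags = await inventory.load_date_flags_async(day)  # [] if no file yet
    flags.append({'src': '203.0.113.7', 'url': 'https://example.com/news'})
    await inventory.save_date_flags_async(day, flags)
    # -> writes /tmp/sb-data/flags/2023-01-05.json

asyncio.run(demo())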

searchlib.py  (+79 −73)

@@ -51,87 +51,93 @@ PASS_KWS = [
 def process_results(rtbl):
     rw_domains = [(re.compile(reg), rw) for reg, rw in REWRITE_DOMAINS]
     excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS]
 
     results = {}
 
     for qdate, rents in rtbl.items():
-        for rent in rents:
-            uscheme, unl, upath, upar, uq, ufrag = urllib.parse.urlparse(rent['u'])
-
-            # Check skip because of domain exclusion
-            skip_cuz_url = False
-            for reg in excl_domains:
-                if reg.fullmatch(unl):
-                    skip_cuz_url = True
-                    break
-            if skip_cuz_url:
-                continue
-
-            # Rewrite the domain, if applicable
-            for reg, rw in rw_domains:
-                if reg.fullmatch(unl):
-                    unl = rw
-                    break
-
-            # Check skip because of URL path file extension
-            skip_cuz_pathext = False
-            for ext in EXCL_PATH_EXTS:
-                if upath.endswith(ext):
-                    skip_cuz_pathext = True
-                    break
-            if skip_cuz_pathext:
-                continue
-
-            has_kws = 'nkw' in rent
-
-            # Check skip because of hard exclusion keywords
-            skip_cuz_kws = False
-            if has_kws:
-                for kw in EXCL_KWS:
-                    for akw in rent['nkw']:
-                        if kw in akw:
-                            skip_cuz_kws = True
-                    if skip_cuz_kws:
-                        break
-            if skip_cuz_kws:
-                continue
-
-            # Now characterize what kind of entry it is.
-            has_date = 'nd' in rent
-            has_title = 'nt' in rent
-            has_pass_kw = False
-            if has_kws:
-                for kw in PASS_KWS:
-                    for akw in rent['nkw']:
-                        if kw in akw:
-                            has_pass_kw = True
-                    if has_pass_kw:
-                        break
-
-            # Try to assemble a record to store the thing.
-            eff_date = rent['nd'] if has_date else qdate
-            eff_url = urllib.parse.urlunparse([uscheme, unl, upath, upar, uq, ''])
-            item = {
-                'slug': gen_slug(rent['t'], eff_date),
-                'url': eff_url,
-                'gtitle': rent['t'],
-                'title': rent['nt'] if has_title else None,
-                'date': eff_date,
-                'kws': rent['nkw'] if has_kws else None
-            }
-
-            if eff_date not in results:
-                results[eff_date] = {
-                    'pass': [],
-                    'maybe': []
-                }
-
-            if has_pass_kw:
-                results[eff_date]['pass'].append(item)
-            else:
-                results[eff_date]['maybe'].append(item)
-
+        results[qdate] = _process_day_results(rw_domains, excl_domains, qdate, rents)
     return results
+
+def process_day_results(query_date, rents):
+    rw_domains = [(re.compile(reg), rw) for reg, rw in REWRITE_DOMAINS]
+    excl_domains = [re.compile(regex) for regex in EXCL_DOMAINS]
+    return _process_day_results(rw_domains, excl_domains, query_date, rents)
+
+def _process_day_results(rw_domains, excl_domains, qdate, rents):
+    articles = {
+        'pass': [],
+        'maybe': [],
+    }
+
+    for rent in rents:
+        uscheme, unl, upath, upar, uq, ufrag = urllib.parse.urlparse(rent['u'])
+
+        # Check skip because of domain exclusion
+        skip_cuz_url = False
+        for reg in excl_domains:
+            if reg.fullmatch(unl):
+                skip_cuz_url = True
+                break
+        if skip_cuz_url:
+            continue
+
+        # Rewrite the domain, if applicable
+        for reg, rw in rw_domains:
+            if reg.fullmatch(unl):
+                unl = rw
+                break
+
+        # Check skip because of URL path file extension
+        skip_cuz_pathext = False
+        for ext in EXCL_PATH_EXTS:
+            if upath.endswith(ext):
+                skip_cuz_pathext = True
+                break
+        if skip_cuz_pathext:
+            continue
+
+        has_kws = 'nkw' in rent
+
+        # Check skip because of hard exclusion keywords
+        skip_cuz_kws = False
+        if has_kws:
+            for kw in EXCL_KWS:
+                for akw in rent['nkw']:
+                    if kw in akw:
+                        skip_cuz_kws = True
+        if skip_cuz_kws:
+            continue
+
+        # Now characterize what kind of entry it is.
+        has_date = 'nd' in rent
+        has_title = 'nt' in rent
+        has_pass_kw = False
+        if has_kws:
+            for kw in PASS_KWS:
+                for akw in rent['nkw']:
+                    if kw in akw:
+                        has_pass_kw = True
+                if has_pass_kw:
+                    break
+
+        # Try to assemble a record to store the thing.
+        eff_date = rent['nd'] if has_date else qdate
+        eff_url = urllib.parse.urlunparse([uscheme, unl, upath, upar, uq, ''])
+        item = {
+            'slug': gen_slug(rent['t'], eff_date),
+            'url': eff_url,
+            'gtitle': rent['t'],
+            'title': rent['nt'] if has_title else None,
+            'date': eff_date,
+            'kws': rent['nkw'] if has_kws else None
+        }
+
+        if has_pass_kw:
+            articles['pass'].append(item)
+        else:
+            articles['maybe'].append(item)
+
+    return articles
 
 SLUG_CHUNKS = re.compile('[abcdefghijklmnopqrstuvwxyz1234567890]+')
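
With the refactor, per-day classification is callable on its own, which is what app.py now uses. A rough sketch of a call with an invented result entry ('u' = URL, 't' = title, 'nkw' = normalized keywords; whether it lands in 'pass' or 'maybe' depends on the module's PASS_KWS/EXCL_KWS/EXCL_DOMAINS lists, which aren't shown in this diff):

import searchlib

# One raw search result in the shape the code reads; values invented.
rent = {
    'u': 'https://example.com/2023/01/05/truck-stuck',
    't': 'Truck stuck under bridge',
    'nkw': ['storrow drive', 'truck'],
}

day = searchlib.process_day_results('2023-01-05', [rent])
# Entries matching a PASS_KWS keyword land in 'pass'; anything else
# that survives the domain/extension/keyword filters lands in 'maybe'.
print(len(day['pass']), len(day['maybe']))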


templates/main.htm  (+12 −5)

@@ -1,7 +1,14 @@
 {% extends 'base.htm' %}
 
-{% macro render_article_link(desc) %}
-<li><a href="{{ desc.url }}">{{ desc.title }}</a></li>
+{% macro render_article_link(desc, date) %}
+<li>
+  <a href="{{ desc.url }}">{{ desc.title }}</a>
+  <form action="/action/flag" method="POST">
+    <input type="hidden" name="date" value="{{ date }}"/>
+    <input type="hidden" name="article" value="{{ desc.url }}"/>
+    <input type="submit" value="Flag False Positive"/>
+  </form>
+</li>
 {% endmacro %}
 
 {% block content %}
@@ -39,7 +46,7 @@
     false-positives (which you can report soon!). There's also a way to request
     incident data directly the MassDOT, but that's a lot more work to pull data
     from and I'm not sure how quickly it's updated so I'll leave that to someone
-    more dedicated to work on.
+    more dedicated to <a href="https://code.tr3y.io/treyzania/storrowed.boston">work on</a>.
     </p>
 
     <h1>Articles</h1>
@@ -58,7 +65,7 @@
     <h3>{{ d.date }}</h3>
     <ul>
     {% for l in d.links %}
-        {{ render_article_link(l) }}
+        {{ render_article_link(l, d.date) }}
     {% endfor %}
     </ul>
     {% if d.maybe_links|length > 0 %}
@@ -66,7 +73,7 @@
     <summary><em>and {{ d.maybe_links|length }} weak match(es)</em></summary>
     <ul>
     {% for l in d.maybe_links %}
-        {{ render_article_link(l) }}
+        {{ render_article_link(l, d.date) }}
     {% endfor %}
     </ul>
     </details>
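
To see what the updated macro emits, it can be rendered standalone with jinja2; the hidden 'date' and 'article' fields are exactly what the new handle_flag endpoint reads (the article values below are made up):

from jinja2 import Template

TMPL = '''
{% macro render_article_link(desc, date) %}
<li>
  <a href="{{ desc.url }}">{{ desc.title }}</a>
  <form action="/action/flag" method="POST">
    <input type="hidden" name="date" value="{{ date }}"/>
    <input type="hidden" name="article" value="{{ desc.url }}"/>
    <input type="submit" value="Flag False Positive"/>
  </form>
</li>
{% endmacro %}
{{ render_article_link(desc, date) }}
'''

print(Template(TMPL).render(
    desc={'url': 'https://example.com/news', 'title': 'Truck stuck under bridge'},
    date='2023-01-05',
))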
