Browse Source

Added most of article submission tooling and tweaked query algo.

master
Trey Del Bonis 2 years ago
parent
commit
c6e24f3f90
7 changed files with 132 additions and 23 deletions
  1. 1
    0
      Pipfile
  2. 34
    7
      Pipfile.lock
  3. 24
    9
      app.py
  4. 27
    3
      inventory.py
  5. 6
    2
      query.py
  6. 7
    2
      searchlib.py
  7. 33
    0
      uploadresults.py

+ 1
- 0
Pipfile View File

@@ -15,6 +15,7 @@ uvicorn = { version = "*", extras = ["standard"] }
jinja2 = "*"
aiofile = "*"
httpx = "==0.18"
requests = "*"

[dev-packages]


+ 34
- 7
Pipfile.lock View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "7946547c9b8ce1b4406c56e7a25730654b11ee233284a40b4801384c87363fb5"
"sha256": "85a4e859bb95a38f4fde2354fafd837a7bf943f479d50c570be3d686dd1028e9"
},
"pipfile-spec": 6,
"requires": {
@@ -34,10 +34,10 @@
},
"anyio": {
"hashes": [
"sha256:4fd09a25ab7fa01d34512b7249e366cd10358cdafc95022c7ff8c8f8a5026d66",
"sha256:67da67b5b21f96b9d3d65daa6ea99f5d5282cb09f50eb4456f8fb51dffefc3ff"
"sha256:24adc69309fb5779bc1e06158e143e0b6d2c56b302a3ac3de3083c705a6ed39d",
"sha256:2855a9423524abcdd652d942f8932fda1735210f77a6b392eafd9ff34d3fe020"
],
"version": "==3.3.4"
"version": "==3.4.0"
},
"asgiref": {
"hashes": [
@@ -138,10 +138,10 @@
},
"filelock": {
"hashes": [
"sha256:7afc856f74fa7006a289fd10fa840e1eebd8bbff6bffb69c26c54a0512ea8cf8",
"sha256:bb2a1c717df74c48a2d00ed625e5a66f8572a3a30baacb7657add1d7bac4097b"
"sha256:2e139a228bcf56dd8b2274a65174d005c4a6b68540ee0bdbb92c76f43f29f7e8",
"sha256:93d512b32a23baf4cac44ffd72ccf70732aeff7b8050fcaf6d3ec406d954baf4"
],
"version": "==3.3.2"
"version": "==3.4.0"
},
"googlesearch-python": {
"hashes": [
@@ -509,23 +509,38 @@
},
"regex": {
"hashes": [
"sha256:0416f7399e918c4b0e074a0f66e5191077ee2ca32a0f99d4c187a62beb47aa05",
"sha256:05b7d6d7e64efe309972adab77fc2af8907bb93217ec60aa9fe12a0dad35874f",
"sha256:0617383e2fe465732af4509e61648b77cbe3aee68b6ac8c0b6fe934db90be5cc",
"sha256:07856afef5ffcc052e7eccf3213317fbb94e4a5cd8177a2caa69c980657b3cb4",
"sha256:0f594b96fe2e0821d026365f72ac7b4f0b487487fb3d4aaf10dd9d97d88a9737",
"sha256:139a23d1f5d30db2cc6c7fd9c6d6497872a672db22c4ae1910be22d4f4b2068a",
"sha256:162abfd74e88001d20cb73ceaffbfe601469923e875caf9118333b1a4aaafdc4",
"sha256:2207ae4f64ad3af399e2d30dde66f0b36ae5c3129b52885f1bffc2f05ec505c8",
"sha256:2409b5c9cef7054dde93a9803156b411b677affc84fca69e908b1cb2c540025d",
"sha256:2fee3ed82a011184807d2127f1733b4f6b2ff6ec7151d83ef3477f3b96a13d03",
"sha256:30ab804ea73972049b7a2a5c62d97687d69b5a60a67adca07eb73a0ddbc9e29f",
"sha256:3598893bde43091ee5ca0a6ad20f08a0435e93a69255eeb5f81b85e81e329264",
"sha256:3b5df18db1fccd66de15aa59c41e4f853b5df7550723d26aa6cb7f40e5d9da5a",
"sha256:3c5fb32cc6077abad3bbf0323067636d93307c9fa93e072771cf9a64d1c0f3ef",
"sha256:416c5f1a188c91e3eb41e9c8787288e707f7d2ebe66e0a6563af280d9b68478f",
"sha256:42b50fa6666b0d50c30a990527127334d6b96dd969011e843e726a64011485da",
"sha256:432bd15d40ed835a51617521d60d0125867f7b88acf653e4ed994a1f8e4995dc",
"sha256:473e67837f786404570eae33c3b64a4b9635ae9f00145250851a1292f484c063",
"sha256:4aaa4e0705ef2b73dd8e36eeb4c868f80f8393f5f4d855e94025ce7ad8525f50",
"sha256:50a7ddf3d131dc5633dccdb51417e2d1910d25cbcf842115a3a5893509140a3a",
"sha256:529801a0d58809b60b3531ee804d3e3be4b412c94b5d267daa3de7fadef00f49",
"sha256:537ca6a3586931b16a85ac38c08cc48f10fc870a5b25e51794c74df843e9966d",
"sha256:53db2c6be8a2710b359bfd3d3aa17ba38f8aa72a82309a12ae99d3c0c3dcd74d",
"sha256:5537f71b6d646f7f5f340562ec4c77b6e1c915f8baae822ea0b7e46c1f09b733",
"sha256:563d5f9354e15e048465061509403f68424fef37d5add3064038c2511c8f5e00",
"sha256:5d408a642a5484b9b4d11dea15a489ea0928c7e410c7525cd892f4d04f2f617b",
"sha256:61600a7ca4bcf78a96a68a27c2ae9389763b5b94b63943d5158f2a377e09d29a",
"sha256:6650f16365f1924d6014d2ea770bde8555b4a39dc9576abb95e3cd1ff0263b36",
"sha256:666abff54e474d28ff42756d94544cdfd42e2ee97065857413b72e8a2d6a6345",
"sha256:68a067c11463de2a37157930d8b153005085e42bcb7ad9ca562d77ba7d1404e0",
"sha256:6e1d2cc79e8dae442b3fa4a26c5794428b98f81389af90623ffcc650ce9f6732",
"sha256:74cbeac0451f27d4f50e6e8a8f3a52ca074b5e2da9f7b505c4201a57a8ed6286",
"sha256:780b48456a0f0ba4d390e8b5f7c661fdd218934388cde1a974010a965e200e12",
"sha256:788aef3549f1924d5c38263104dae7395bf020a42776d5ec5ea2b0d3d85d6646",
"sha256:7ee1227cf08b6716c85504aebc49ac827eb88fcc6e51564f010f11a406c0a667",
@@ -535,24 +550,34 @@
"sha256:9345b6f7ee578bad8e475129ed40123d265464c4cfead6c261fd60fc9de00bcf",
"sha256:93a5051fcf5fad72de73b96f07d30bc29665697fb8ecdfbc474f3452c78adcf4",
"sha256:962b9a917dd7ceacbe5cd424556914cb0d636001e393b43dc886ba31d2a1e449",
"sha256:96fc32c16ea6d60d3ca7f63397bff5c75c5a562f7db6dec7d412f7c4d2e78ec0",
"sha256:98ba568e8ae26beb726aeea2273053c717641933836568c2a0278a84987b2a1a",
"sha256:a3feefd5e95871872673b08636f96b61ebef62971eab044f5124fb4dea39919d",
"sha256:a955b747d620a50408b7fdf948e04359d6e762ff8a85f5775d907ceced715129",
"sha256:b43c2b8a330a490daaef5a47ab114935002b13b3f9dc5da56d5322ff218eeadb",
"sha256:b483c9d00a565633c87abd0aaf27eb5016de23fed952e054ecc19ce32f6a9e7e",
"sha256:b9ed0b1e5e0759d6b7f8e2f143894b2a7f3edd313f38cf44e1e15d360e11749b",
"sha256:ba05430e819e58544e840a68b03b28b6d328aff2e41579037e8bab7653b37d83",
"sha256:ca49e1ab99593438b204e00f3970e7a5f70d045267051dfa6b5f4304fcfa1dbf",
"sha256:ca5f18a75e1256ce07494e245cdb146f5a9267d3c702ebf9b65c7f8bd843431e",
"sha256:cd410a1cbb2d297c67d8521759ab2ee3f1d66206d2e4328502a487589a2cb21b",
"sha256:ce298e3d0c65bd03fa65ffcc6db0e2b578e8f626d468db64fdf8457731052942",
"sha256:d5ca078bb666c4a9d1287a379fe617a6dccd18c3e8a7e6c7e1eb8974330c626a",
"sha256:d5fd67df77bab0d3f4ea1d7afca9ef15c2ee35dfb348c7b57ffb9782a6e4db6e",
"sha256:da1a90c1ddb7531b1d5ff1e171b4ee61f6345119be7351104b67ff413843fe94",
"sha256:dba70f30fd81f8ce6d32ddeef37d91c8948e5d5a4c63242d16a2b2df8143aafc",
"sha256:dc07f021ee80510f3cd3af2cad5b6a3b3a10b057521d9e6aaeb621730d320c5a",
"sha256:dd33eb9bdcfbabab3459c9ee651d94c842bc8a05fabc95edf4ee0c15a072495e",
"sha256:e0538c43565ee6e703d3a7c3bdfe4037a5209250e8502c98f20fea6f5fdf2965",
"sha256:e1f54b9b4b6c53369f40028d2dd07a8c374583417ee6ec0ea304e710a20f80a0",
"sha256:e32d2a2b02ccbef10145df9135751abea1f9f076e67a4e261b05f24b94219e36",
"sha256:e6096b0688e6e14af6a1b10eaad86b4ff17935c49aa774eac7c95a57a4e8c296",
"sha256:e71255ba42567d34a13c03968736c5d39bb4a97ce98188fafb27ce981115beec",
"sha256:ed2e07c6a26ed4bea91b897ee2b0835c21716d9a469a96c3e878dc5f8c55bb23",
"sha256:eef2afb0fd1747f33f1ee3e209bce1ed582d1896b240ccc5e2697e3275f037c7",
"sha256:f23222527b307970e383433daec128d769ff778d9b29343fb3496472dc20dabe",
"sha256:f341ee2df0999bfdf7a95e448075effe0db212a59387de1a70690e4acb03d4c6",
"sha256:f5be7805e53dafe94d295399cfbe5227f39995a997f4fd8539bf3cbdc8f47ca8",
"sha256:f7f325be2804246a75a4f45c72d4ce80d2443ab815063cdf70ee8fb2ca59ee1b",
"sha256:f8af619e3be812a2059b212064ea7a640aff0568d972cd1b9e920837469eb3cb",
"sha256:fa8c626d6441e2d04b6ee703ef2d1e17608ad44c7cb75258c09dd42bacdfc64b",
@@ -566,6 +591,7 @@
"sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804",
"sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"
],
"index": "pypi",
"version": "==2.25.1"
},
"requests-file": {
@@ -639,6 +665,7 @@
},
"typing-extensions": {
"hashes": [
"sha256:2cdf80e4e04866a9b3689a51869016d36db0814d84b8d8a568d22781d45d27ed",
"sha256:829704698b22e13ec9eaf959122315eabb370b0884400e9818334d8b677023d9"
],
"version": "==4.0.0"

+ 24
- 9
app.py View File

@@ -1,5 +1,5 @@

from datetime import datetime
from datetime import datetime, timedelta
import json
import traceback

@@ -12,6 +12,7 @@ from fastapi.templating import Jinja2Templates
from fastapi.middleware.cors import CORSMiddleware

import reporthook
import inventory
import sbenv
import searchlib

@@ -56,7 +57,7 @@ async def handle_exception(req: Request, exc: Exception):

@app.get('/')
async def render_main(req: Request):
raw_articles = await load_days_from_file('testresults.json')
raw_articles = await load_recent_articles()
converted = convert_days_from_articles(raw_articles)
num_days = calc_num_days(converted)

@@ -80,13 +81,16 @@ async def handle_addarticle(req: Request):
return JSONResponse(status_code=403, content={'error': 'forbidden'})

body = await req.json()
add_article(body)
await add_article(body['date'], body['desc'])

return {'status': 'OK'}

def add_article(article):
# TODO
pass
async def add_article(datestr, adesc):
    """Append one article description to the stored report for its day.

    datestr is parsed with inventory.DATE_FORMAT to pick the day; adesc is
    appended unchanged to that day's report, which is then written back.
    Raises ValueError (from strptime) if datestr does not match the format.
    """
    report_day = datetime.strptime(datestr, inventory.DATE_FORMAT)

    # Read-modify-write of the whole day's report: load, extend, save.
    day_report = await inventory.load_date_report_async(report_day)
    day_report.append(adesc)
    await inventory.save_date_report_async(report_day, day_report)

################################
# Utilities
@@ -116,6 +120,19 @@ async def load_days_from_file(path):
contents = await f.read()
return json.loads(contents)

async def load_recent_articles():
    """Collect the non-empty daily reports for the most recent days.

    Starting from datetime.now() and stepping back one day at a time for
    MAX_SEARCH_DAYS iterations, loads each day's report through the
    inventory module. Returns a dict keyed by the day formatted with
    inventory.DATE_FORMAT; days whose report is empty are left out.
    """
    now = datetime.now()
    found = {}

    for days_back in range(MAX_SEARCH_DAYS):
        day = now - timedelta(days=days_back)
        day_report = await inventory.load_date_report_async(day)
        if len(day_report) == 0:
            # Nothing stored for this day; do not emit an empty entry.
            continue
        found[day.strftime(inventory.DATE_FORMAT)] = day_report

    return found

def convert_days_from_articles(rarts):
processed = searchlib.process_results(rarts)
output = []
@@ -142,14 +159,12 @@ def convert_article(a):
'slug': a['slug'],
}

DATE_FORMAT = "%Y-%m-%d"

def calc_num_days(dayslist):
today = datetime.now()
lowest = -1

for d in dayslist:
pd = datetime.strptime(d['date'], DATE_FORMAT)
pd = datetime.strptime(d['date'], inventory.DATE_FORMAT)
diff = today - pd
ndays = diff.days
if ndays < lowest or lowest == -1:

+ 27
- 3
inventory.py View File

@@ -1,6 +1,10 @@
import os
import json

import aiofiles

DATE_FORMAT = "%Y-%m-%d"

def get_datadir():
return os.getenv('SB_DATADIR')

@@ -16,13 +20,33 @@ def save_article(data):
return json.save(data, f)

def get_date_report_path(date):
dstr = date.strftime('%Y-%m-%d')
dstr = date.strftime(DATE_FORMAT)
return os.path.join(get_datadir(), 'days', dstr + '.json')

def load_date_report(date):
with open(get_date_report_path(date), 'r') as f:
path = get_date_report_path(date)
if not os.path.exists(path):
return []

with open(path, 'r') as f:
return json.load(f)

async def load_date_report_async(date):
    """Asynchronously load the JSON report stored for the given date.

    The file location comes from get_date_report_path(date). Returns the
    decoded JSON content, or an empty list when no file exists at that path.
    """
    report_path = get_date_report_path(date)
    if os.path.exists(report_path):
        async with aiofiles.open(report_path, 'r') as fh:
            raw = await fh.read()
        return json.loads(raw)

    # No report has been written for this date.
    return []

def save_date_report(date, data):
with open(get_date_report_path(date), 'w') as f:
path = get_date_report_path(date)
os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
with open(path, 'w') as f:
json.dump(data, f)

async def save_date_report_async(date, data):
    """Asynchronously write data as JSON to the report file for the given date.

    The file location comes from get_date_report_path(date); its parent
    directory is created if it does not exist yet. Any existing report at
    that path is overwritten.

    Raises TypeError/ValueError from json.dumps if data is not
    JSON-serializable; in that case the existing file is left untouched.
    """
    path = get_date_report_path(date)

    # Serialize before opening: mode 'w' truncates the file immediately, so
    # a json.dumps failure after the open would wipe an existing report.
    payload = json.dumps(data)

    os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
    async with aiofiles.open(path, 'w') as f:
        await f.write(payload)

+ 6
- 2
query.py View File

@@ -1,16 +1,20 @@
#!/usr/bin/env python3

import sys
import datetime
import json

import searchlib

if __name__ == '__main__':
startdiff = int(sys.argv[1])
numdays = int(sys.argv[2]) if len(sys.argv) == 3 else startdiff

today = datetime.datetime.now()
tendays = datetime.timedelta(days=18)
tendays = datetime.timedelta(days=startdiff)
startday = today - tendays

searchlib.prep_nltk()
res_tbl = searchlib.query_range(startday, 10)
res_tbl = searchlib.query_range(startday, numdays)

print(json.dumps(res_tbl, indent=' '))

+ 7
- 2
searchlib.py View File

@@ -56,7 +56,7 @@ def process_results(rtbl):

for qdate, rents in rtbl.items():
for rent in rents:
uscheme, unl, upath, uq, ufrag = urllib.parse.urlsplit(rent['u'])
uscheme, unl, upath, upar, uq, ufrag = urllib.parse.urlparse(rent['u'])

# Check skip because of domain exclusion
skip_cuz_url = False
@@ -110,7 +110,7 @@ def process_results(rtbl):

# Try to assemble a record to store the thing.
eff_date = rent['nd'] if has_date else qdate
eff_url = urllib.parse.urlunparse([uscheme, unl, upath, uq, ufrag, ''])
eff_url = urllib.parse.urlunparse([uscheme, unl, upath, upar, uq, ''])
item = {
'slug': gen_slug(rent['t'], eff_date),
'url': eff_url,
@@ -166,6 +166,11 @@ def query_range(startdate, numdays, preloadurls=None):

art = None
try:
u = urllib.parse.urlparse(rurl)
if (u.path == '/' or u.path == '') and u.params == '':
print('url is for website main page and has no params, probably not a news article:', rurl)
continue

print('processing', rurl)
a = newspaper.Article(rurl)
a.download()

+ 33
- 0
uploadresults.py View File

@@ -0,0 +1,33 @@
#!/usr/bin/env python3

import os
import sys
import json

import requests

if __name__ == '__main__':
    # Usage: uploadresults.py <results.json> <dest-url>
    #
    # Reads a JSON object mapping a date string to a list of article
    # entries and POSTs each entry to <dest-url> as {'date': ..., 'desc': ...},
    # authenticating with the bearer token from the SB_ADMIN_KEY env var.
    if len(sys.argv) < 3:
        sys.exit('usage: %s <results.json> <dest-url>' % sys.argv[0])

    token = os.getenv('SB_ADMIN_KEY')
    if not token:
        # Without this check the header would literally be "Bearer None".
        sys.exit('SB_ADMIN_KEY is not set; refusing to upload without a token')

    filename = sys.argv[1]
    desturl = sys.argv[2]

    with open(filename, 'r') as f:
        days = json.load(f)

    h = {
        'Authorization': 'Bearer %s' % token,
        # The body below is a JSON document, so label it as such.
        'Content-Type': 'application/json',
    }

    # One session for the whole run so the connection is reused across
    # the many small uploads instead of being reopened per article.
    with requests.Session() as session:
        session.headers.update(h)

        for d, arts in days.items():
            print('==== Uploading', d, '...')
            for a in arts:
                print('uploading', a['u'])

                body = {
                    'date': d,
                    'desc': a
                }

                # requests has no default timeout; without one a stalled
                # server would hang the script indefinitely.
                r = session.post(desturl, data=json.dumps(body), timeout=30)
                # Stop at the first rejected upload rather than continuing.
                r.raise_for_status()

Loading…
Cancel
Save