Browse Source

Added most of article submission tooling and tweaked query algo.

master
Trey Del Bonis 2 years ago
parent
commit
c6e24f3f90
7 changed files with 132 additions and 23 deletions
  1. 1
    0
      Pipfile
  2. 34
    7
      Pipfile.lock
  3. 24
    9
      app.py
  4. 27
    3
      inventory.py
  5. 6
    2
      query.py
  6. 7
    2
      searchlib.py
  7. 33
    0
      uploadresults.py

+ 1
- 0
Pipfile View File

jinja2 = "*" jinja2 = "*"
aiofile = "*" aiofile = "*"
httpx = "==0.18" httpx = "==0.18"
requests = "*"


[dev-packages] [dev-packages]



+ 34
- 7
Pipfile.lock View File

{ {
"_meta": { "_meta": {
"hash": { "hash": {
"sha256": "7946547c9b8ce1b4406c56e7a25730654b11ee233284a40b4801384c87363fb5"
"sha256": "85a4e859bb95a38f4fde2354fafd837a7bf943f479d50c570be3d686dd1028e9"
}, },
"pipfile-spec": 6, "pipfile-spec": 6,
"requires": { "requires": {
}, },
"anyio": { "anyio": {
"hashes": [ "hashes": [
"sha256:4fd09a25ab7fa01d34512b7249e366cd10358cdafc95022c7ff8c8f8a5026d66",
"sha256:67da67b5b21f96b9d3d65daa6ea99f5d5282cb09f50eb4456f8fb51dffefc3ff"
"sha256:24adc69309fb5779bc1e06158e143e0b6d2c56b302a3ac3de3083c705a6ed39d",
"sha256:2855a9423524abcdd652d942f8932fda1735210f77a6b392eafd9ff34d3fe020"
], ],
"version": "==3.3.4"
"version": "==3.4.0"
}, },
"asgiref": { "asgiref": {
"hashes": [ "hashes": [
}, },
"filelock": { "filelock": {
"hashes": [ "hashes": [
"sha256:7afc856f74fa7006a289fd10fa840e1eebd8bbff6bffb69c26c54a0512ea8cf8",
"sha256:bb2a1c717df74c48a2d00ed625e5a66f8572a3a30baacb7657add1d7bac4097b"
"sha256:2e139a228bcf56dd8b2274a65174d005c4a6b68540ee0bdbb92c76f43f29f7e8",
"sha256:93d512b32a23baf4cac44ffd72ccf70732aeff7b8050fcaf6d3ec406d954baf4"
], ],
"version": "==3.3.2"
"version": "==3.4.0"
}, },
"googlesearch-python": { "googlesearch-python": {
"hashes": [ "hashes": [
}, },
"regex": { "regex": {
"hashes": [ "hashes": [
"sha256:0416f7399e918c4b0e074a0f66e5191077ee2ca32a0f99d4c187a62beb47aa05",
"sha256:05b7d6d7e64efe309972adab77fc2af8907bb93217ec60aa9fe12a0dad35874f", "sha256:05b7d6d7e64efe309972adab77fc2af8907bb93217ec60aa9fe12a0dad35874f",
"sha256:0617383e2fe465732af4509e61648b77cbe3aee68b6ac8c0b6fe934db90be5cc", "sha256:0617383e2fe465732af4509e61648b77cbe3aee68b6ac8c0b6fe934db90be5cc",
"sha256:07856afef5ffcc052e7eccf3213317fbb94e4a5cd8177a2caa69c980657b3cb4", "sha256:07856afef5ffcc052e7eccf3213317fbb94e4a5cd8177a2caa69c980657b3cb4",
"sha256:0f594b96fe2e0821d026365f72ac7b4f0b487487fb3d4aaf10dd9d97d88a9737",
"sha256:139a23d1f5d30db2cc6c7fd9c6d6497872a672db22c4ae1910be22d4f4b2068a",
"sha256:162abfd74e88001d20cb73ceaffbfe601469923e875caf9118333b1a4aaafdc4", "sha256:162abfd74e88001d20cb73ceaffbfe601469923e875caf9118333b1a4aaafdc4",
"sha256:2207ae4f64ad3af399e2d30dde66f0b36ae5c3129b52885f1bffc2f05ec505c8", "sha256:2207ae4f64ad3af399e2d30dde66f0b36ae5c3129b52885f1bffc2f05ec505c8",
"sha256:2409b5c9cef7054dde93a9803156b411b677affc84fca69e908b1cb2c540025d",
"sha256:2fee3ed82a011184807d2127f1733b4f6b2ff6ec7151d83ef3477f3b96a13d03",
"sha256:30ab804ea73972049b7a2a5c62d97687d69b5a60a67adca07eb73a0ddbc9e29f", "sha256:30ab804ea73972049b7a2a5c62d97687d69b5a60a67adca07eb73a0ddbc9e29f",
"sha256:3598893bde43091ee5ca0a6ad20f08a0435e93a69255eeb5f81b85e81e329264",
"sha256:3b5df18db1fccd66de15aa59c41e4f853b5df7550723d26aa6cb7f40e5d9da5a", "sha256:3b5df18db1fccd66de15aa59c41e4f853b5df7550723d26aa6cb7f40e5d9da5a",
"sha256:3c5fb32cc6077abad3bbf0323067636d93307c9fa93e072771cf9a64d1c0f3ef", "sha256:3c5fb32cc6077abad3bbf0323067636d93307c9fa93e072771cf9a64d1c0f3ef",
"sha256:416c5f1a188c91e3eb41e9c8787288e707f7d2ebe66e0a6563af280d9b68478f", "sha256:416c5f1a188c91e3eb41e9c8787288e707f7d2ebe66e0a6563af280d9b68478f",
"sha256:42b50fa6666b0d50c30a990527127334d6b96dd969011e843e726a64011485da",
"sha256:432bd15d40ed835a51617521d60d0125867f7b88acf653e4ed994a1f8e4995dc", "sha256:432bd15d40ed835a51617521d60d0125867f7b88acf653e4ed994a1f8e4995dc",
"sha256:473e67837f786404570eae33c3b64a4b9635ae9f00145250851a1292f484c063",
"sha256:4aaa4e0705ef2b73dd8e36eeb4c868f80f8393f5f4d855e94025ce7ad8525f50", "sha256:4aaa4e0705ef2b73dd8e36eeb4c868f80f8393f5f4d855e94025ce7ad8525f50",
"sha256:50a7ddf3d131dc5633dccdb51417e2d1910d25cbcf842115a3a5893509140a3a",
"sha256:529801a0d58809b60b3531ee804d3e3be4b412c94b5d267daa3de7fadef00f49",
"sha256:537ca6a3586931b16a85ac38c08cc48f10fc870a5b25e51794c74df843e9966d", "sha256:537ca6a3586931b16a85ac38c08cc48f10fc870a5b25e51794c74df843e9966d",
"sha256:53db2c6be8a2710b359bfd3d3aa17ba38f8aa72a82309a12ae99d3c0c3dcd74d", "sha256:53db2c6be8a2710b359bfd3d3aa17ba38f8aa72a82309a12ae99d3c0c3dcd74d",
"sha256:5537f71b6d646f7f5f340562ec4c77b6e1c915f8baae822ea0b7e46c1f09b733", "sha256:5537f71b6d646f7f5f340562ec4c77b6e1c915f8baae822ea0b7e46c1f09b733",
"sha256:563d5f9354e15e048465061509403f68424fef37d5add3064038c2511c8f5e00",
"sha256:5d408a642a5484b9b4d11dea15a489ea0928c7e410c7525cd892f4d04f2f617b",
"sha256:61600a7ca4bcf78a96a68a27c2ae9389763b5b94b63943d5158f2a377e09d29a",
"sha256:6650f16365f1924d6014d2ea770bde8555b4a39dc9576abb95e3cd1ff0263b36", "sha256:6650f16365f1924d6014d2ea770bde8555b4a39dc9576abb95e3cd1ff0263b36",
"sha256:666abff54e474d28ff42756d94544cdfd42e2ee97065857413b72e8a2d6a6345", "sha256:666abff54e474d28ff42756d94544cdfd42e2ee97065857413b72e8a2d6a6345",
"sha256:68a067c11463de2a37157930d8b153005085e42bcb7ad9ca562d77ba7d1404e0", "sha256:68a067c11463de2a37157930d8b153005085e42bcb7ad9ca562d77ba7d1404e0",
"sha256:6e1d2cc79e8dae442b3fa4a26c5794428b98f81389af90623ffcc650ce9f6732",
"sha256:74cbeac0451f27d4f50e6e8a8f3a52ca074b5e2da9f7b505c4201a57a8ed6286",
"sha256:780b48456a0f0ba4d390e8b5f7c661fdd218934388cde1a974010a965e200e12", "sha256:780b48456a0f0ba4d390e8b5f7c661fdd218934388cde1a974010a965e200e12",
"sha256:788aef3549f1924d5c38263104dae7395bf020a42776d5ec5ea2b0d3d85d6646", "sha256:788aef3549f1924d5c38263104dae7395bf020a42776d5ec5ea2b0d3d85d6646",
"sha256:7ee1227cf08b6716c85504aebc49ac827eb88fcc6e51564f010f11a406c0a667", "sha256:7ee1227cf08b6716c85504aebc49ac827eb88fcc6e51564f010f11a406c0a667",
"sha256:9345b6f7ee578bad8e475129ed40123d265464c4cfead6c261fd60fc9de00bcf", "sha256:9345b6f7ee578bad8e475129ed40123d265464c4cfead6c261fd60fc9de00bcf",
"sha256:93a5051fcf5fad72de73b96f07d30bc29665697fb8ecdfbc474f3452c78adcf4", "sha256:93a5051fcf5fad72de73b96f07d30bc29665697fb8ecdfbc474f3452c78adcf4",
"sha256:962b9a917dd7ceacbe5cd424556914cb0d636001e393b43dc886ba31d2a1e449", "sha256:962b9a917dd7ceacbe5cd424556914cb0d636001e393b43dc886ba31d2a1e449",
"sha256:96fc32c16ea6d60d3ca7f63397bff5c75c5a562f7db6dec7d412f7c4d2e78ec0",
"sha256:98ba568e8ae26beb726aeea2273053c717641933836568c2a0278a84987b2a1a", "sha256:98ba568e8ae26beb726aeea2273053c717641933836568c2a0278a84987b2a1a",
"sha256:a3feefd5e95871872673b08636f96b61ebef62971eab044f5124fb4dea39919d", "sha256:a3feefd5e95871872673b08636f96b61ebef62971eab044f5124fb4dea39919d",
"sha256:a955b747d620a50408b7fdf948e04359d6e762ff8a85f5775d907ceced715129",
"sha256:b43c2b8a330a490daaef5a47ab114935002b13b3f9dc5da56d5322ff218eeadb", "sha256:b43c2b8a330a490daaef5a47ab114935002b13b3f9dc5da56d5322ff218eeadb",
"sha256:b483c9d00a565633c87abd0aaf27eb5016de23fed952e054ecc19ce32f6a9e7e", "sha256:b483c9d00a565633c87abd0aaf27eb5016de23fed952e054ecc19ce32f6a9e7e",
"sha256:b9ed0b1e5e0759d6b7f8e2f143894b2a7f3edd313f38cf44e1e15d360e11749b",
"sha256:ba05430e819e58544e840a68b03b28b6d328aff2e41579037e8bab7653b37d83", "sha256:ba05430e819e58544e840a68b03b28b6d328aff2e41579037e8bab7653b37d83",
"sha256:ca49e1ab99593438b204e00f3970e7a5f70d045267051dfa6b5f4304fcfa1dbf",
"sha256:ca5f18a75e1256ce07494e245cdb146f5a9267d3c702ebf9b65c7f8bd843431e", "sha256:ca5f18a75e1256ce07494e245cdb146f5a9267d3c702ebf9b65c7f8bd843431e",
"sha256:cd410a1cbb2d297c67d8521759ab2ee3f1d66206d2e4328502a487589a2cb21b",
"sha256:ce298e3d0c65bd03fa65ffcc6db0e2b578e8f626d468db64fdf8457731052942",
"sha256:d5ca078bb666c4a9d1287a379fe617a6dccd18c3e8a7e6c7e1eb8974330c626a", "sha256:d5ca078bb666c4a9d1287a379fe617a6dccd18c3e8a7e6c7e1eb8974330c626a",
"sha256:d5fd67df77bab0d3f4ea1d7afca9ef15c2ee35dfb348c7b57ffb9782a6e4db6e",
"sha256:da1a90c1ddb7531b1d5ff1e171b4ee61f6345119be7351104b67ff413843fe94", "sha256:da1a90c1ddb7531b1d5ff1e171b4ee61f6345119be7351104b67ff413843fe94",
"sha256:dba70f30fd81f8ce6d32ddeef37d91c8948e5d5a4c63242d16a2b2df8143aafc", "sha256:dba70f30fd81f8ce6d32ddeef37d91c8948e5d5a4c63242d16a2b2df8143aafc",
"sha256:dc07f021ee80510f3cd3af2cad5b6a3b3a10b057521d9e6aaeb621730d320c5a",
"sha256:dd33eb9bdcfbabab3459c9ee651d94c842bc8a05fabc95edf4ee0c15a072495e", "sha256:dd33eb9bdcfbabab3459c9ee651d94c842bc8a05fabc95edf4ee0c15a072495e",
"sha256:e0538c43565ee6e703d3a7c3bdfe4037a5209250e8502c98f20fea6f5fdf2965", "sha256:e0538c43565ee6e703d3a7c3bdfe4037a5209250e8502c98f20fea6f5fdf2965",
"sha256:e1f54b9b4b6c53369f40028d2dd07a8c374583417ee6ec0ea304e710a20f80a0", "sha256:e1f54b9b4b6c53369f40028d2dd07a8c374583417ee6ec0ea304e710a20f80a0",
"sha256:e32d2a2b02ccbef10145df9135751abea1f9f076e67a4e261b05f24b94219e36", "sha256:e32d2a2b02ccbef10145df9135751abea1f9f076e67a4e261b05f24b94219e36",
"sha256:e6096b0688e6e14af6a1b10eaad86b4ff17935c49aa774eac7c95a57a4e8c296",
"sha256:e71255ba42567d34a13c03968736c5d39bb4a97ce98188fafb27ce981115beec", "sha256:e71255ba42567d34a13c03968736c5d39bb4a97ce98188fafb27ce981115beec",
"sha256:ed2e07c6a26ed4bea91b897ee2b0835c21716d9a469a96c3e878dc5f8c55bb23", "sha256:ed2e07c6a26ed4bea91b897ee2b0835c21716d9a469a96c3e878dc5f8c55bb23",
"sha256:eef2afb0fd1747f33f1ee3e209bce1ed582d1896b240ccc5e2697e3275f037c7", "sha256:eef2afb0fd1747f33f1ee3e209bce1ed582d1896b240ccc5e2697e3275f037c7",
"sha256:f23222527b307970e383433daec128d769ff778d9b29343fb3496472dc20dabe", "sha256:f23222527b307970e383433daec128d769ff778d9b29343fb3496472dc20dabe",
"sha256:f341ee2df0999bfdf7a95e448075effe0db212a59387de1a70690e4acb03d4c6", "sha256:f341ee2df0999bfdf7a95e448075effe0db212a59387de1a70690e4acb03d4c6",
"sha256:f5be7805e53dafe94d295399cfbe5227f39995a997f4fd8539bf3cbdc8f47ca8",
"sha256:f7f325be2804246a75a4f45c72d4ce80d2443ab815063cdf70ee8fb2ca59ee1b", "sha256:f7f325be2804246a75a4f45c72d4ce80d2443ab815063cdf70ee8fb2ca59ee1b",
"sha256:f8af619e3be812a2059b212064ea7a640aff0568d972cd1b9e920837469eb3cb", "sha256:f8af619e3be812a2059b212064ea7a640aff0568d972cd1b9e920837469eb3cb",
"sha256:fa8c626d6441e2d04b6ee703ef2d1e17608ad44c7cb75258c09dd42bacdfc64b", "sha256:fa8c626d6441e2d04b6ee703ef2d1e17608ad44c7cb75258c09dd42bacdfc64b",
"sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804", "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804",
"sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e" "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"
], ],
"index": "pypi",
"version": "==2.25.1" "version": "==2.25.1"
}, },
"requests-file": { "requests-file": {
}, },
"typing-extensions": { "typing-extensions": {
"hashes": [ "hashes": [
"sha256:2cdf80e4e04866a9b3689a51869016d36db0814d84b8d8a568d22781d45d27ed",
"sha256:829704698b22e13ec9eaf959122315eabb370b0884400e9818334d8b677023d9" "sha256:829704698b22e13ec9eaf959122315eabb370b0884400e9818334d8b677023d9"
], ],
"version": "==4.0.0" "version": "==4.0.0"

+ 24
- 9
app.py View File



from datetime import datetime
from datetime import datetime, timedelta
import json import json
import traceback import traceback


from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware


import reporthook import reporthook
import inventory
import sbenv import sbenv
import searchlib import searchlib




@app.get('/') @app.get('/')
async def render_main(req: Request): async def render_main(req: Request):
raw_articles = await load_days_from_file('testresults.json')
raw_articles = await load_recent_articles()
converted = convert_days_from_articles(raw_articles) converted = convert_days_from_articles(raw_articles)
num_days = calc_num_days(converted) num_days = calc_num_days(converted)


return JSONResponse(status_code=403, content={'error': 'forbidden'}) return JSONResponse(status_code=403, content={'error': 'forbidden'})


body = await req.json() body = await req.json()
add_article(body)
await add_article(body['date'], body['desc'])


return {'status': 'OK'} return {'status': 'OK'}


def add_article(article):
# TODO
pass
async def add_article(datestr, adesc):
    """Append one article description to the stored report for a given day.

    ``datestr`` is parsed with ``inventory.DATE_FORMAT``; ``datetime.strptime``
    raises ``ValueError`` if it does not match. The day's existing report is
    loaded, ``adesc`` is appended to it, and the whole list is written back.
    """
    report_day = datetime.strptime(datestr, inventory.DATE_FORMAT)

    # Read-modify-write of the whole per-day report.
    entries = await inventory.load_date_report_async(report_day)
    entries.append(adesc)
    await inventory.save_date_report_async(report_day, entries)


################################ ################################
# Utilities # Utilities
contents = await f.read() contents = await f.read()
return json.loads(contents) return json.loads(contents)


async def load_recent_articles():
    """Collect the non-empty daily reports for the most recent days.

    Walks back from today, one day at a time, for ``MAX_SEARCH_DAYS`` days and
    returns a dict mapping each day's date string (formatted with
    ``inventory.DATE_FORMAT``) to that day's report. Days whose report is
    empty are left out.
    """
    now = datetime.now()
    found = {}

    for offset in range(MAX_SEARCH_DAYS):
        day = now - timedelta(days=offset)
        entries = await inventory.load_date_report_async(day)
        if len(entries) == 0:
            # Nothing stored for this day; keep it out of the result.
            continue
        found[day.strftime(inventory.DATE_FORMAT)] = entries

    return found

def convert_days_from_articles(rarts): def convert_days_from_articles(rarts):
processed = searchlib.process_results(rarts) processed = searchlib.process_results(rarts)
output = [] output = []
'slug': a['slug'], 'slug': a['slug'],
} }


DATE_FORMAT = "%Y-%m-%d"

def calc_num_days(dayslist): def calc_num_days(dayslist):
today = datetime.now() today = datetime.now()
lowest = -1 lowest = -1


for d in dayslist: for d in dayslist:
pd = datetime.strptime(d['date'], DATE_FORMAT)
pd = datetime.strptime(d['date'], inventory.DATE_FORMAT)
diff = today - pd diff = today - pd
ndays = diff.days ndays = diff.days
if ndays < lowest or lowest == -1: if ndays < lowest or lowest == -1:

+ 27
- 3
inventory.py View File

import os import os
import json import json


import aiofiles

DATE_FORMAT = "%Y-%m-%d"

def get_datadir(): def get_datadir():
return os.getenv('SB_DATADIR') return os.getenv('SB_DATADIR')


return json.save(data, f) return json.save(data, f)


def get_date_report_path(date): def get_date_report_path(date):
dstr = date.strftime('%Y-%m-%d')
dstr = date.strftime(DATE_FORMAT)
return os.path.join(get_datadir(), 'days', dstr + '.json') return os.path.join(get_datadir(), 'days', dstr + '.json')


def load_date_report(date): def load_date_report(date):
with open(get_date_report_path(date), 'r') as f:
path = get_date_report_path(date)
if not os.path.exists(path):
return []

with open(path, 'r') as f:
return json.load(f) return json.load(f)


async def load_date_report_async(date):
    """Asynchronously load the stored report for ``date``.

    Returns the parsed JSON contents of the day's report file, or an empty
    list when no file exists for that day.

    Raises ``json.JSONDecodeError`` if the file exists but is not valid JSON.
    """
    # NOTE(review): this relies on the `aiofiles` package; the Pipfile context
    # visible in this commit lists `aiofile` -- confirm the dependency is declared.
    path = get_date_report_path(date)

    # Open directly and treat a missing file as "no report" instead of
    # checking os.path.exists() first: that avoids a check-then-open race
    # and a blocking stat call inside the coroutine.
    try:
        async with aiofiles.open(path, 'r') as f:
            contents = await f.read()
    except FileNotFoundError:
        return []

    return json.loads(contents)

def save_date_report(date, data): def save_date_report(date, data):
with open(get_date_report_path(date), 'w') as f:
path = get_date_report_path(date)
os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
with open(path, 'w') as f:
json.dump(data, f) json.dump(data, f)

async def save_date_report_async(date, data):
    """Asynchronously write ``data`` as JSON to the report file for ``date``.

    The parent directory of the report file is created if it does not exist.

    Raises ``TypeError`` (from ``json.dumps``) if ``data`` is not JSON
    serializable; in that case any existing report file is left untouched.
    """
    # Serialize before opening: mode 'w' truncates the file on open, so a
    # serialization failure after that point would wipe the existing report.
    serialized = json.dumps(data)

    path = get_date_report_path(date)
    os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
    async with aiofiles.open(path, 'w') as f:
        await f.write(serialized)

+ 6
- 2
query.py View File

#!/usr/bin/env python3 #!/usr/bin/env python3


import sys
import datetime import datetime
import json import json


import searchlib import searchlib


if __name__ == '__main__': if __name__ == '__main__':
startdiff = int(sys.argv[1])
numdays = int(sys.argv[2]) if len(sys.argv) == 3 else startdiff

today = datetime.datetime.now() today = datetime.datetime.now()
tendays = datetime.timedelta(days=18)
tendays = datetime.timedelta(days=startdiff)
startday = today - tendays startday = today - tendays


searchlib.prep_nltk() searchlib.prep_nltk()
res_tbl = searchlib.query_range(startday, 10)
res_tbl = searchlib.query_range(startday, numdays)


print(json.dumps(res_tbl, indent=' ')) print(json.dumps(res_tbl, indent=' '))

+ 7
- 2
searchlib.py View File



for qdate, rents in rtbl.items(): for qdate, rents in rtbl.items():
for rent in rents: for rent in rents:
uscheme, unl, upath, uq, ufrag = urllib.parse.urlsplit(rent['u'])
uscheme, unl, upath, upar, uq, ufrag = urllib.parse.urlparse(rent['u'])


# Check skip because of domain exclusion # Check skip because of domain exclusion
skip_cuz_url = False skip_cuz_url = False


# Try to assemble a record to store the thing. # Try to assemble a record to store the thing.
eff_date = rent['nd'] if has_date else qdate eff_date = rent['nd'] if has_date else qdate
eff_url = urllib.parse.urlunparse([uscheme, unl, upath, uq, ufrag, ''])
eff_url = urllib.parse.urlunparse([uscheme, unl, upath, upar, uq, ''])
item = { item = {
'slug': gen_slug(rent['t'], eff_date), 'slug': gen_slug(rent['t'], eff_date),
'url': eff_url, 'url': eff_url,


art = None art = None
try: try:
u = urllib.parse.urlparse(rurl)
if (u.path == '/' or u.path == '') and u.params == '':
print('url is for website main page and has no params, probably not a news article:', rurl)
continue

print('processing', rurl) print('processing', rurl)
a = newspaper.Article(rurl) a = newspaper.Article(rurl)
a.download() a.download()

+ 33
- 0
uploadresults.py View File

#!/usr/bin/env python3

import os
import sys
import json

import requests

if __name__ == '__main__':
    # Upload every article in a results file to the submission endpoint,
    # one POST per article.
    #
    # Usage: uploadresults.py <results.json> <dest-url>
    # The bearer token is read from the SB_ADMIN_KEY environment variable.
    if len(sys.argv) < 3:
        sys.exit('usage: %s <results.json> <dest-url>' % sys.argv[0])

    filename = sys.argv[1]
    desturl = sys.argv[2]

    token = os.getenv('SB_ADMIN_KEY')
    if not token:
        # Without this check the header would literally be "Bearer None".
        sys.exit('SB_ADMIN_KEY must be set in the environment')

    # The file is expected to map a date string to a list of article
    # descriptions, each carrying at least a 'u' key (printed below).
    with open(filename, 'r') as f:
        days = json.load(f)

    h = {
        'Authorization': 'Bearer %s' % token,
    }

    for d, arts in days.items():
        print('==== Uploading', d, '...')
        for a in arts:
            print('uploading', a['u'])

            body = {
                'date': d,
                'desc': a
            }

            # json= serializes the body and sets the JSON content type;
            # the timeout keeps a stalled server from hanging the upload.
            r = requests.post(desturl, headers=h, json=body, timeout=30)
            # Abort on the first failed upload.
            r.raise_for_status()

Loading…
Cancel
Save