Browse Source

Added core search library.

master
Trey Del Bonis 2 years ago
parent
commit
d76a1340bf
6 changed files with 538 additions and 0 deletions
  1. 5
    0
      .gitignore
  2. 13
    0
      Pipfile
  3. 358
    0
      Pipfile.lock
  4. 16
    0
      query.py
  5. 13
    0
      sbenv.py
  6. 133
    0
      searchlib.py

+ 5
- 0
.gitignore View File

@@ -0,0 +1,5 @@
# Emacs
*~

# Python
__pycache__/

+ 13
- 0
Pipfile View File

@@ -0,0 +1,13 @@
[[source]]
url = "https://pypi.python.org/simple"
verify_ssl = true
name = "pypi"

[packages]
googlesearch-python = "==1.0.1"
newspaper3k = "*"

[dev-packages]

[requires]
python_version = "3.9"

+ 358
- 0
Pipfile.lock View File

@@ -0,0 +1,358 @@
{
"_meta": {
"hash": {
"sha256": "1017604161fdc53f7659f00476c87409c771e1e7746532b0475acc7934d7cd00"
},
"pipfile-spec": 6,
"requires": {
"python_version": "3.9"
},
"sources": [
{
"name": "pypi",
"url": "https://pypi.python.org/simple",
"verify_ssl": true
}
]
},
"default": {
"beautifulsoup4": {
"hashes": [
"sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35",
"sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25",
"sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666"
],
"version": "==4.9.3"
},
"certifi": {
"hashes": [
"sha256:2bbf76fd432960138b3ef6dda3dde0544f27cbf8546c458e60baf371917ba9ee",
"sha256:50b1e4f8446b06f41be7dd6338db18e0990601dce795c2b1686458aa7e8fa7d8"
],
"version": "==2021.5.30"
},
"chardet": {
"hashes": [
"sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa",
"sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5"
],
"version": "==4.0.0"
},
"click": {
"hashes": [
"sha256:8c04c11192119b1ef78ea049e0a6f0463e4c48ef00a30160c704337586f3ad7a",
"sha256:fba402a4a47334742d782209a7c79bc448911afe1149d07bdabdf480b3e2f4b6"
],
"version": "==8.0.1"
},
"cssselect": {
"hashes": [
"sha256:f612ee47b749c877ebae5bb77035d8f4202c6ad0f0fc1271b3c18ad6c4468ecf",
"sha256:f95f8dedd925fd8f54edb3d2dfb44c190d9d18512377d3c1e2388d16126879bc"
],
"version": "==1.1.0"
},
"feedfinder2": {
"hashes": [
"sha256:3701ee01a6c85f8b865a049c30ba0b4608858c803fe8e30d1d289fdbe89d0efe"
],
"version": "==0.0.4"
},
"feedparser": {
"hashes": [
"sha256:1b7f57841d9cf85074deb316ed2c795091a238adb79846bc46dccdaf80f9c59a",
"sha256:5ce0410a05ab248c8c7cfca3a0ea2203968ee9ff4486067379af4827a59f9661"
],
"version": "==6.0.8"
},
"filelock": {
"hashes": [
"sha256:18d82244ee114f543149c66a6e0c14e9c4f8a1044b5cdaadd0f82159d6a6ff59",
"sha256:929b7d63ec5b7d6b71b0fa5ac14e030b3f70b75747cef1b10da9b879fef15836"
],
"version": "==3.0.12"
},
"googlesearch-python": {
"hashes": [
"sha256:1e1286941a1ea029de4cf7b56a5151ef85324e288e9b41c2c3b81bb945b44427",
"sha256:4392d99ec0bff58eb2b6074f34559462c5b99a31886d48d81e3ed414163e281b"
],
"index": "pypi",
"version": "==1.0.1"
},
"idna": {
"hashes": [
"sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6",
"sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0"
],
"version": "==2.10"
},
"jieba3k": {
"hashes": [
"sha256:980a4f2636b778d312518066be90c7697d410dd5a472385f5afced71a2db1c10"
],
"version": "==0.35.1"
},
"joblib": {
"hashes": [
"sha256:9c17567692206d2f3fb9ecf5e991084254fe631665c450b443761c4186a613f7",
"sha256:feeb1ec69c4d45129954f1b7034954241eedfd6ba39b5e9e4b6883be3332d5e5"
],
"version": "==1.0.1"
},
"lxml": {
"hashes": [
"sha256:079f3ae844f38982d156efce585bc540c16a926d4436712cf4baee0cce487a3d",
"sha256:0fbcf5565ac01dff87cbfc0ff323515c823081c5777a9fc7703ff58388c258c3",
"sha256:122fba10466c7bd4178b07dba427aa516286b846b2cbd6f6169141917283aae2",
"sha256:1b38116b6e628118dea5b2186ee6820ab138dbb1e24a13e478490c7db2f326ae",
"sha256:1b7584d421d254ab86d4f0b13ec662a9014397678a7c4265a02a6d7c2b18a75f",
"sha256:26e761ab5b07adf5f555ee82fb4bfc35bf93750499c6c7614bd64d12aaa67927",
"sha256:289e9ca1a9287f08daaf796d96e06cb2bc2958891d7911ac7cae1c5f9e1e0ee3",
"sha256:2a9d50e69aac3ebee695424f7dbd7b8c6d6eb7de2a2eb6b0f6c7db6aa41e02b7",
"sha256:3082c518be8e97324390614dacd041bb1358c882d77108ca1957ba47738d9d59",
"sha256:33bb934a044cf32157c12bfcfbb6649807da20aa92c062ef51903415c704704f",
"sha256:3439c71103ef0e904ea0a1901611863e51f50b5cd5e8654a151740fde5e1cade",
"sha256:36108c73739985979bf302006527cf8a20515ce444ba916281d1c43938b8bb96",
"sha256:39b78571b3b30645ac77b95f7c69d1bffc4cf8c3b157c435a34da72e78c82468",
"sha256:4289728b5e2000a4ad4ab8da6e1db2e093c63c08bdc0414799ee776a3f78da4b",
"sha256:4bff24dfeea62f2e56f5bab929b4428ae6caba2d1eea0c2d6eb618e30a71e6d4",
"sha256:4c61b3a0db43a1607d6264166b230438f85bfed02e8cff20c22e564d0faff354",
"sha256:542d454665a3e277f76954418124d67516c5f88e51a900365ed54a9806122b83",
"sha256:5a0a14e264069c03e46f926be0d8919f4105c1623d620e7ec0e612a2e9bf1c04",
"sha256:5c8c163396cc0df3fd151b927e74f6e4acd67160d6c33304e805b84293351d16",
"sha256:64812391546a18896adaa86c77c59a4998f33c24788cadc35789e55b727a37f4",
"sha256:66e575c62792c3f9ca47cb8b6fab9e35bab91360c783d1606f758761810c9791",
"sha256:6f12e1427285008fd32a6025e38e977d44d6382cf28e7201ed10d6c1698d2a9a",
"sha256:74f7d8d439b18fa4c385f3f5dfd11144bb87c1da034a466c5b5577d23a1d9b51",
"sha256:7610b8c31688f0b1be0ef882889817939490a36d0ee880ea562a4e1399c447a1",
"sha256:76fa7b1362d19f8fbd3e75fe2fb7c79359b0af8747e6f7141c338f0bee2f871a",
"sha256:7728e05c35412ba36d3e9795ae8995e3c86958179c9770e65558ec3fdfd3724f",
"sha256:8157dadbb09a34a6bd95a50690595e1fa0af1a99445e2744110e3dca7831c4ee",
"sha256:820628b7b3135403540202e60551e741f9b6d3304371712521be939470b454ec",
"sha256:884ab9b29feaca361f7f88d811b1eea9bfca36cf3da27768d28ad45c3ee6f969",
"sha256:89b8b22a5ff72d89d48d0e62abb14340d9e99fd637d046c27b8b257a01ffbe28",
"sha256:92e821e43ad382332eade6812e298dc9701c75fe289f2a2d39c7960b43d1e92a",
"sha256:b007cbb845b28db4fb8b6a5cdcbf65bacb16a8bd328b53cbc0698688a68e1caa",
"sha256:bc4313cbeb0e7a416a488d72f9680fffffc645f8a838bd2193809881c67dd106",
"sha256:bccbfc27563652de7dc9bdc595cb25e90b59c5f8e23e806ed0fd623755b6565d",
"sha256:c1a40c06fd5ba37ad39caa0b3144eb3772e813b5fb5b084198a985431c2f1e8d",
"sha256:c47ff7e0a36d4efac9fd692cfa33fbd0636674c102e9e8d9b26e1b93a94e7617",
"sha256:c4f05c5a7c49d2fb70223d0d5bcfbe474cf928310ac9fa6a7c6dddc831d0b1d4",
"sha256:cdaf11d2bd275bf391b5308f86731e5194a21af45fbaaaf1d9e8147b9160ea92",
"sha256:ce256aaa50f6cc9a649c51be3cd4ff142d67295bfc4f490c9134d0f9f6d58ef0",
"sha256:d2e35d7bf1c1ac8c538f88d26b396e73dd81440d59c1ef8522e1ea77b345ede4",
"sha256:d916d31fd85b2f78c76400d625076d9124de3e4bda8b016d25a050cc7d603f24",
"sha256:df7c53783a46febb0e70f6b05df2ba104610f2fb0d27023409734a3ecbb78fb2",
"sha256:e1cbd3f19a61e27e011e02f9600837b921ac661f0c40560eefb366e4e4fb275e",
"sha256:efac139c3f0bf4f0939f9375af4b02c5ad83a622de52d6dfa8e438e8e01d0eb0",
"sha256:efd7a09678fd8b53117f6bae4fa3825e0a22b03ef0a932e070c0bdbb3a35e654",
"sha256:f2380a6376dfa090227b663f9678150ef27543483055cc327555fb592c5967e2",
"sha256:f8380c03e45cf09f8557bdaa41e1fa7c81f3ae22828e1db470ab2a6c96d8bc23",
"sha256:f90ba11136bfdd25cae3951af8da2e95121c9b9b93727b1b896e3fa105b2f586"
],
"version": "==4.6.3"
},
"newspaper3k": {
"hashes": [
"sha256:44a864222633d3081113d1030615991c3dbba87239f6bbf59d91240f71a22e3e",
"sha256:9f1bd3e1fb48f400c715abf875cc7b0a67b7ddcd87f50c9aeeb8fcbbbd9004fb"
],
"index": "pypi",
"version": "==0.2.8"
},
"nltk": {
"hashes": [
"sha256:240e23ab1ab159ef9940777d30c7c72d7e76d91877099218a7585370c11f6b9e",
"sha256:57d556abed621ab9be225cc6d2df1edce17572efb67a3d754630c9f8381503eb"
],
"version": "==3.6.2"
},
"pillow": {
"hashes": [
"sha256:0b2efa07f69dc395d95bb9ef3299f4ca29bcb2157dc615bae0b42c3c20668ffc",
"sha256:114f816e4f73f9ec06997b2fde81a92cbf0777c9e8f462005550eed6bae57e63",
"sha256:147bd9e71fb9dcf08357b4d530b5167941e222a6fd21f869c7911bac40b9994d",
"sha256:15a2808e269a1cf2131930183dcc0419bc77bb73eb54285dde2706ac9939fa8e",
"sha256:196560dba4da7a72c5e7085fccc5938ab4075fd37fe8b5468869724109812edd",
"sha256:1c03e24be975e2afe70dfc5da6f187eea0b49a68bb2b69db0f30a61b7031cee4",
"sha256:1fd5066cd343b5db88c048d971994e56b296868766e461b82fa4e22498f34d77",
"sha256:29c9569049d04aaacd690573a0398dbd8e0bf0255684fee512b413c2142ab723",
"sha256:2b6dfa068a8b6137da34a4936f5a816aba0ecc967af2feeb32c4393ddd671cba",
"sha256:2cac53839bfc5cece8fdbe7f084d5e3ee61e1303cccc86511d351adcb9e2c792",
"sha256:2ee77c14a0299d0541d26f3d8500bb57e081233e3fa915fa35abd02c51fa7fae",
"sha256:37730f6e68bdc6a3f02d2079c34c532330d206429f3cee651aab6b66839a9f0e",
"sha256:3f08bd8d785204149b5b33e3b5f0ebbfe2190ea58d1a051c578e29e39bfd2367",
"sha256:479ab11cbd69612acefa8286481f65c5dece2002ffaa4f9db62682379ca3bb77",
"sha256:4bc3c7ef940eeb200ca65bd83005eb3aae8083d47e8fcbf5f0943baa50726856",
"sha256:660a87085925c61a0dcc80efb967512ac34dbb256ff7dd2b9b4ee8dbdab58cf4",
"sha256:67b3666b544b953a2777cb3f5a922e991be73ab32635666ee72e05876b8a92de",
"sha256:70af7d222df0ff81a2da601fab42decb009dc721545ed78549cb96e3a1c5f0c8",
"sha256:75e09042a3b39e0ea61ce37e941221313d51a9c26b8e54e12b3ececccb71718a",
"sha256:8960a8a9f4598974e4c2aeb1bff9bdd5db03ee65fd1fce8adf3223721aa2a636",
"sha256:9364c81b252d8348e9cc0cb63e856b8f7c1b340caba6ee7a7a65c968312f7dab",
"sha256:969cc558cca859cadf24f890fc009e1bce7d7d0386ba7c0478641a60199adf79",
"sha256:9a211b663cf2314edbdb4cf897beeb5c9ee3810d1d53f0e423f06d6ebbf9cd5d",
"sha256:a17ca41f45cf78c2216ebfab03add7cc350c305c38ff34ef4eef66b7d76c5229",
"sha256:a2f381932dca2cf775811a008aa3027671ace723b7a38838045b1aee8669fdcf",
"sha256:a4eef1ff2d62676deabf076f963eda4da34b51bc0517c70239fafed1d5b51500",
"sha256:c088a000dfdd88c184cc7271bfac8c5b82d9efa8637cd2b68183771e3cf56f04",
"sha256:c0e0550a404c69aab1e04ae89cca3e2a042b56ab043f7f729d984bf73ed2a093",
"sha256:c11003197f908878164f0e6da15fce22373ac3fc320cda8c9d16e6bba105b844",
"sha256:c2a5ff58751670292b406b9f06e07ed1446a4b13ffced6b6cab75b857485cbc8",
"sha256:c35d09db702f4185ba22bb33ef1751ad49c266534339a5cebeb5159d364f6f82",
"sha256:c379425c2707078dfb6bfad2430728831d399dc95a7deeb92015eb4c92345eaf",
"sha256:cc866706d56bd3a7dbf8bac8660c6f6462f2f2b8a49add2ba617bc0c54473d83",
"sha256:d0da39795049a9afcaadec532e7b669b5ebbb2a9134576ebcc15dd5bdae33cc0",
"sha256:f156d6ecfc747ee111c167f8faf5f4953761b5e66e91a4e6767e548d0f80129c",
"sha256:f4ebde71785f8bceb39dcd1e7f06bcc5d5c3cf48b9f69ab52636309387b097c8",
"sha256:fc214a6b75d2e0ea7745488da7da3c381f41790812988c7a92345978414fad37",
"sha256:fd7eef578f5b2200d066db1b50c4aa66410786201669fb76d5238b007918fb24",
"sha256:ff04c373477723430dce2e9d024c708a047d44cf17166bf16e604b379bf0ca14"
],
"version": "==8.3.1"
},
"python-dateutil": {
"hashes": [
"sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86",
"sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"
],
"version": "==2.8.2"
},
"pyyaml": {
"hashes": [
"sha256:08682f6b72c722394747bddaf0aa62277e02557c0fd1c42cb853016a38f8dedf",
"sha256:0f5f5786c0e09baddcd8b4b45f20a7b5d61a7e7e99846e3c799b05c7c53fa696",
"sha256:129def1b7c1bf22faffd67b8f3724645203b79d8f4cc81f674654d9902cb4393",
"sha256:294db365efa064d00b8d1ef65d8ea2c3426ac366c0c4368d930bf1c5fb497f77",
"sha256:3b2b1824fe7112845700f815ff6a489360226a5609b96ec2190a45e62a9fc922",
"sha256:3bd0e463264cf257d1ffd2e40223b197271046d09dadf73a0fe82b9c1fc385a5",
"sha256:4465124ef1b18d9ace298060f4eccc64b0850899ac4ac53294547536533800c8",
"sha256:49d4cdd9065b9b6e206d0595fee27a96b5dd22618e7520c33204a4a3239d5b10",
"sha256:4e0583d24c881e14342eaf4ec5fbc97f934b999a6828693a99157fde912540cc",
"sha256:5accb17103e43963b80e6f837831f38d314a0495500067cb25afab2e8d7a4018",
"sha256:607774cbba28732bfa802b54baa7484215f530991055bb562efbed5b2f20a45e",
"sha256:6c78645d400265a062508ae399b60b8c167bf003db364ecb26dcab2bda048253",
"sha256:72a01f726a9c7851ca9bfad6fd09ca4e090a023c00945ea05ba1638c09dc3347",
"sha256:74c1485f7707cf707a7aef42ef6322b8f97921bd89be2ab6317fd782c2d53183",
"sha256:895f61ef02e8fed38159bb70f7e100e00f471eae2bc838cd0f4ebb21e28f8541",
"sha256:8c1be557ee92a20f184922c7b6424e8ab6691788e6d86137c5d93c1a6ec1b8fb",
"sha256:bb4191dfc9306777bc594117aee052446b3fa88737cd13b7188d0e7aa8162185",
"sha256:bfb51918d4ff3d77c1c856a9699f8492c612cde32fd3bcd344af9be34999bfdc",
"sha256:c20cfa2d49991c8b4147af39859b167664f2ad4561704ee74c1de03318e898db",
"sha256:cb333c16912324fd5f769fff6bc5de372e9e7a202247b48870bc251ed40239aa",
"sha256:d2d9808ea7b4af864f35ea216be506ecec180628aced0704e34aca0b040ffe46",
"sha256:d483ad4e639292c90170eb6f7783ad19490e7a8defb3e46f97dfe4bacae89122",
"sha256:dd5de0646207f053eb0d6c74ae45ba98c3395a571a2891858e87df7c9b9bd51b",
"sha256:e1d4970ea66be07ae37a3c2e48b5ec63f7ba6804bdddfdbd3cfd954d25a82e63",
"sha256:e4fac90784481d221a8e4b1162afa7c47ed953be40d31ab4629ae917510051df",
"sha256:fa5ae20527d8e831e8230cbffd9f8fe952815b2b7dae6ffec25318803a7528fc",
"sha256:fd7f6999a8070df521b6384004ef42833b9bd62cfee11a09bda1079b4b704247",
"sha256:fdc842473cd33f45ff6bce46aea678a54e3d21f1b61a7750ce3c498eedfe25d6",
"sha256:fe69978f3f768926cfa37b867e3843918e012cf83f680806599ddce33c2c68b0"
],
"version": "==5.4.1"
},
"regex": {
"hashes": [
"sha256:026beb631097a4a3def7299aa5825e05e057de3c6d72b139c37813bfa351274b",
"sha256:14caacd1853e40103f59571f169704367e79fb78fac3d6d09ac84d9197cadd16",
"sha256:16d9eaa8c7e91537516c20da37db975f09ac2e7772a0694b245076c6d68f85da",
"sha256:18fdc51458abc0a974822333bd3a932d4e06ba2a3243e9a1da305668bd62ec6d",
"sha256:28e8af338240b6f39713a34e337c3813047896ace09d51593d6907c66c0708ba",
"sha256:3835de96524a7b6869a6c710b26c90e94558c31006e96ca3cf6af6751b27dca1",
"sha256:3905c86cc4ab6d71635d6419a6f8d972cab7c634539bba6053c47354fd04452c",
"sha256:3c09d88a07483231119f5017904db8f60ad67906efac3f1baa31b9b7f7cca281",
"sha256:4551728b767f35f86b8e5ec19a363df87450c7376d7419c3cac5b9ceb4bce576",
"sha256:459bbe342c5b2dec5c5223e7c363f291558bc27982ef39ffd6569e8c082bdc83",
"sha256:4f421e3cdd3a273bace013751c345f4ebeef08f05e8c10757533ada360b51a39",
"sha256:577737ec3d4c195c4aef01b757905779a9e9aee608fa1cf0aec16b5576c893d3",
"sha256:57fece29f7cc55d882fe282d9de52f2f522bb85290555b49394102f3621751ee",
"sha256:7976d410e42be9ae7458c1816a416218364e06e162b82e42f7060737e711d9ce",
"sha256:85f568892422a0e96235eb8ea6c5a41c8ccbf55576a2260c0160800dbd7c4f20",
"sha256:8764a78c5464ac6bde91a8c87dd718c27c1cabb7ed2b4beaf36d3e8e390567f9",
"sha256:8935937dad2c9b369c3d932b0edbc52a62647c2afb2fafc0c280f14a8bf56a6a",
"sha256:8fe58d9f6e3d1abf690174fd75800fda9bdc23d2a287e77758dc0e8567e38ce6",
"sha256:937b20955806381e08e54bd9d71f83276d1f883264808521b70b33d98e4dec5d",
"sha256:9569da9e78f0947b249370cb8fadf1015a193c359e7e442ac9ecc585d937f08d",
"sha256:a3b73390511edd2db2d34ff09aa0b2c08be974c71b4c0505b4a048d5dc128c2b",
"sha256:a4eddbe2a715b2dd3849afbdeacf1cc283160b24e09baf64fa5675f51940419d",
"sha256:a5c6dbe09aff091adfa8c7cfc1a0e83fdb8021ddb2c183512775a14f1435fe16",
"sha256:b63e3571b24a7959017573b6455e05b675050bbbea69408f35f3cb984ec54363",
"sha256:bb350eb1060591d8e89d6bac4713d41006cd4d479f5e11db334a48ff8999512f",
"sha256:bf6d987edd4a44dd2fa2723fca2790f9442ae4de2c8438e53fcb1befdf5d823a",
"sha256:bfa6a679410b394600eafd16336b2ce8de43e9b13f7fb9247d84ef5ad2b45e91",
"sha256:c856ec9b42e5af4fe2d8e75970fcc3a2c15925cbcc6e7a9bcb44583b10b95e80",
"sha256:cea56288eeda8b7511d507bbe7790d89ae7049daa5f51ae31a35ae3c05408531",
"sha256:ea212df6e5d3f60341aef46401d32fcfded85593af1d82b8b4a7a68cd67fdd6b",
"sha256:f35567470ee6dbfb946f069ed5f5615b40edcbb5f1e6e1d3d2b114468d505fc6",
"sha256:fbc20975eee093efa2071de80df7f972b7b35e560b213aafabcec7c0bd00bd8c",
"sha256:ff4a8ad9638b7ca52313d8732f37ecd5fd3c8e3aff10a8ccb93176fd5b3812f6"
],
"version": "==2021.8.3"
},
"requests": {
"hashes": [
"sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804",
"sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"
],
"version": "==2.25.1"
},
"requests-file": {
"hashes": [
"sha256:07d74208d3389d01c38ab89ef403af0cfec63957d53a0081d8eca738d0247d8e",
"sha256:dfe5dae75c12481f68ba353183c53a65e6044c923e64c24b2209f6c7570ca953"
],
"version": "==1.5.1"
},
"sgmllib3k": {
"hashes": [
"sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9"
],
"version": "==1.0.0"
},
"six": {
"hashes": [
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
"sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
],
"version": "==1.16.0"
},
"soupsieve": {
"hashes": [
"sha256:052774848f448cf19c7e959adf5566904d525f33a3f8b6ba6f6f8f26ec7de0cc",
"sha256:c2c1c2d44f158cdbddab7824a9af8c4f83c76b1e23e049479aa432feb6c4c23b"
],
"markers": "python_version >= '3.0'",
"version": "==2.2.1"
},
"tinysegmenter": {
"hashes": [
"sha256:ed1f6d2e806a4758a73be589754384cbadadc7e1a414c81a166fc9adf2d40c6d"
],
"version": "==0.3"
},
"tldextract": {
"hashes": [
"sha256:cfae9bc8bda37c3e8c7c8639711ad20e95dc85b207a256b60b0b23d7ff5540ea",
"sha256:e57f22b6d00a28c21673d2048112f1bdcb6a14d4711568305f6bb96cf5bb53a1"
],
"version": "==3.1.0"
},
"tqdm": {
"hashes": [
"sha256:07856e19a1fe4d2d9621b539d3f072fa88c9c1ef1f3b7dd4d4953383134c3164",
"sha256:35540feeaca9ac40c304e916729e6b78045cbbeccd3e941b2868f09306798ac9"
],
"version": "==4.62.1"
},
"urllib3": {
"hashes": [
"sha256:39fb8672126159acb139a7718dd10806104dec1e2f0f6c88aab05d17df10c8d4",
"sha256:f57b4c16c62fa2760b7e3d97c35b255512fb6b59a259730f36ba32ce9f8e342f"
],
"version": "==1.26.6"
}
},
"develop": {}
}

+ 16
- 0
query.py View File

@@ -0,0 +1,16 @@
#!/usr/bin/env python3

import datetime
import json

import searchlib

if __name__ == '__main__':
today = datetime.datetime.now()
tendays = datetime.timedelta(days=18)
startday = today - tendays

searchlib.prep_nltk()
res_tbl = searchlib.query_range(startday, 10)

print(json.dumps(res_tbl, indent=' '))

+ 13
- 0
sbenv.py View File

@@ -0,0 +1,13 @@
import os

import http.cookiejar as cookielib

def load_cookiejar():
cpath = os.getenv('SB_COOKIES_TXT')
print('using cookies', cpath)
cj = cookielib.MozillaCookieJar(cpath)
cj.load()
return cj

def get_google_abuse_token():
return os.getenv('SB_GOOGABUSE')

+ 133
- 0
searchlib.py View File

@@ -0,0 +1,133 @@
import os
import datetime
import urllib

import http.cookiejar as cookielib
import newspaper
import nltk
from bs4 import BeautifulSoup
from requests import get

from sbenv import *

EXCL_DOMAINS = [
'(www\.)?researchgate\.net',
'(www\.)?businessyab.com'
]

# silly google
REWRITE_DOMAINS = [
('.*\.facebook\.com', 'www.facebook.com')
]

# silly google
EXCL_PATH_EXTS = [
'.pdf',
'.docx'
]

# If a page has any of these keywords then we drop it.
EXCL_KWS = [
# for some reason it shows personal injury lawyers
'injury',
'attorney',
'court',
'claim',

# repair shops
'repair',
]

# If it has any of these then it's a hard confirm
PASS_KW = [
'storrow',
'storrowed',
'overpass',
'bridge',
]

def query_range(startdate, numdays):
cookiejar = load_cookiejar()

oneday = datetime.timedelta(days=1)
seenurls = set()

dateurls = {}

for i in range(numdays):
d = startdate + (oneday * i)
print(d)
qresults = query_for_date(d, cookiejar)

dayresults = []
for rurl, rtitle in qresults:
if rurl in seenurls:
continue
seenurls.add(rurl)

rent = {
'u': rurl,
't': rtitle,
}

art = None
try:
print('processing', rurl)
a = newspaper.Article(rurl)
a.download()
a.parse()
a.nlp()

rent['nt'] = a.title
try:
rent['nd'] = a.publish_date.strftime('%Y-%m-%d')
except:
pass
rent['nkw'] = a.keywords

except Exception as e:
print(str(e))

dayresults.append(rent)

dateurls[d.strftime('%Y-%m-%d')] = dayresults

return dateurls

_query_headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}

def query_for_date(ondate, cookiejar):
u = make_search_url(ondate)
resp = get(u, headers=_query_headers, cookies=cookiejar)
resp.raise_for_status()
return list(parse_results(resp.text))

def make_search_url(ondate):
params = {
'q': '"storrow drive" truck accident bridge OR overpass',
'hl': 'en',
'tbs': create_range_param(ondate),
'google_abuse': get_google_abuse_token()
}

query_part = urllib.parse.urlencode(params)
return 'https://www.google.com/search?' + query_part

def create_range_param(ondate):
datestr = ondate.strftime('%m/%d/%Y').lstrip('0').replace(' 0', ' ')
return 'cdr:1,cd_min:%s,cd_max:%s' % (datestr, datestr)

def parse_results(raw_html):
soup = BeautifulSoup(raw_html, 'html.parser')
result_block = soup.find_all('div', attrs={'class': 'g'})
for result in result_block:
link = result.find('a', href=True)
title = result.find('h3')
if link and title:
yield (link['href'], title.text)

def prep_nltk():
nltk.download('punkt')


Loading…
Cancel
Save