feedsearch-crawler: fix for very large RSS feeds (like congressionaldish)
This commit is contained in:
@@ -5,11 +5,16 @@ from feedsearch_crawler import search, sort_urls
|
|||||||
from feedsearch_crawler.crawler import coerce_url
|
from feedsearch_crawler.crawler import coerce_url
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
import sys
|
import sys
|
||||||
url, jsonPath = sys.argv[1:]
|
url, jsonPath = sys.argv[1:]
|
||||||
|
|
||||||
|
logging.getLogger().setLevel(logging.DEBUG)
|
||||||
|
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
|
||||||
|
logging.getLogger(__name__).debug("logging enabled")
|
||||||
|
|
||||||
url = coerce_url(url, default_scheme="https")
|
url = coerce_url(url, default_scheme="https")
|
||||||
items = search(url)
|
items = search(url, total_timeout=180, request_timeout=90, max_content_length=100*1024*1024)
|
||||||
items = sort_urls(items)
|
items = sort_urls(items)
|
||||||
|
|
||||||
# print all results
|
# print all results
|
||||||
|
13
pkgs/feedsearch-crawler/0001-response-chunk-size.patch
Normal file
13
pkgs/feedsearch-crawler/0001-response-chunk-size.patch
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
diff --git a/feedsearch_crawler/crawler/request.py b/feedsearch_crawler/crawler/request.py
|
||||||
|
index 70f3d5a..00668ad 100644
|
||||||
|
--- a/feedsearch_crawler/crawler/request.py
|
||||||
|
+++ b/feedsearch_crawler/crawler/request.py
|
||||||
|
@@ -277,7 +277,7 @@ class Request(Queueable):
|
||||||
|
"""
|
||||||
|
body: bytes = b""
|
||||||
|
try:
|
||||||
|
- async for chunk in resp.content.iter_chunked(1024):
|
||||||
|
+ async for chunk in resp.content.iter_chunked(64 * 1024):
|
||||||
|
if not chunk:
|
||||||
|
break
|
||||||
|
body += chunk
|
@@ -28,6 +28,12 @@ buildPythonPackage rec {
|
|||||||
hash = "sha256-pzvyeXzqdi8pRjk2+QjKhJfgtxbgVT6C08K9fhVFVmY=";
|
hash = "sha256-pzvyeXzqdi8pRjk2+QjKhJfgtxbgVT6C08K9fhVFVmY=";
|
||||||
};
|
};
|
||||||
|
|
||||||
|
patches = [
|
||||||
|
# fix for <https://github.com/aio-libs/aiohttp/issues/4581>
|
||||||
|
# where large feeds would time out in an unrecoverable way
|
||||||
|
./0001-response-chunk-size.patch
|
||||||
|
];
|
||||||
|
|
||||||
nativeBuildInputs = [
|
nativeBuildInputs = [
|
||||||
poetry-core
|
poetry-core
|
||||||
];
|
];
|
||||||
|
Reference in New Issue
Block a user