feedsearch-crawler: fix for very large RSS feeds (like congressionaldish)

2023-01-25 09:44:32 +00:00
parent 33d7819619
commit 1909e0fbe8
3 changed files with 25 additions and 1 deletion

@@ -5,11 +5,16 @@ from feedsearch_crawler import search, sort_urls
 from feedsearch_crawler.crawler import coerce_url
 import json
+import logging
 import sys
 url, jsonPath = sys.argv[1:]
+logging.getLogger().setLevel(logging.DEBUG)
+logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
+logging.getLogger(__name__).debug("logging enabled")
 url = coerce_url(url, default_scheme="https")
-items = search(url)
+items = search(url, total_timeout=180, request_timeout=90, max_content_length=100*1024*1024)
 items = sort_urls(items)
 # print all results
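
Pieced together, the driver script after this change plausibly reads like the sketch below. The JSON-writing tail and item.serialize() are assumptions inferred from the jsonPath argument and feedsearch-crawler's FeedInfo API; they are not shown in the diff.

    import json
    import logging
    import sys

    from feedsearch_crawler import search, sort_urls
    from feedsearch_crawler.crawler import coerce_url

    url, jsonPath = sys.argv[1:]

    # send DEBUG-level logs to stdout so slow/large feeds can be watched live
    logging.getLogger().setLevel(logging.DEBUG)
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
    logging.getLogger(__name__).debug("logging enabled")

    url = coerce_url(url, default_scheme="https")
    # generous timeouts plus a 100 MiB body cap, so very large feeds
    # (like congressionaldish) can finish downloading instead of aborting
    items = search(url, total_timeout=180, request_timeout=90,
                   max_content_length=100*1024*1024)
    items = sort_urls(items)

    # print all results (assumed tail: serialize each FeedInfo and dump to jsonPath)
    with open(jsonPath, "w") as f:
        json.dump([item.serialize() for item in items], f, default=str)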

@@ -0,0 +1,13 @@
diff --git a/feedsearch_crawler/crawler/request.py b/feedsearch_crawler/crawler/request.py
index 70f3d5a..00668ad 100644
--- a/feedsearch_crawler/crawler/request.py
+++ b/feedsearch_crawler/crawler/request.py
@@ -277,7 +277,7 @@ class Request(Queueable):
"""
body: bytes = b""
try:
- async for chunk in resp.content.iter_chunked(1024):
+ async for chunk in resp.content.iter_chunked(64 * 1024):
if not chunk:
break
body += chunk
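
The new 0001-response-chunk-size.patch above is the entire fix: aiohttp hands the body back chunk by chunk, so the chunk size bounds how many timeout-checked reads a multi-megabyte feed needs. As a standalone illustration, a minimal sketch of the same read loop; the body cap mirrors the max_content_length argument from the script above, and the cap and URL are illustrative, not the library's exact code.

    import asyncio
    import aiohttp

    CHUNK_SIZE = 64 * 1024  # patched value; the old 1024 meant ~100k reads for a 100 MiB body

    async def fetch_body(url, max_content_length=100 * 1024 * 1024):
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                body = b""
                # same accumulation pattern as the patched method above
                async for chunk in resp.content.iter_chunked(CHUNK_SIZE):
                    if not chunk:
                        break
                    body += chunk
                    if len(body) > max_content_length:
                        raise ValueError("body exceeds max_content_length")
                return body

    # asyncio.run(fetch_body("https://example.com/huge.rss"))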

@@ -28,6 +28,12 @@ buildPythonPackage rec {
hash = "sha256-pzvyeXzqdi8pRjk2+QjKhJfgtxbgVT6C08K9fhVFVmY=";
};
patches = [
# fix for <https://github.com/aio-libs/aiohttp/issues/4581>
# where large feeds would timeout in an unrecoverable way
./0001-response-chunk-size.patch
];
nativeBuildInputs = [
poetry-core
];
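
With the patch wired into the derivation, the rebuilt package ships the larger chunk size. A quick, hypothetical smoke test that the patched source is what actually got installed:

    import inspect
    from feedsearch_crawler.crawler import request

    # the patched read loop should use 64 KiB chunks
    assert "iter_chunked(64 * 1024)" in inspect.getsource(request), "patch not applied"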