feedsearch-crawler: fix for very large RSS feeds (like congressionaldish)

2023-01-25 09:44:32 +00:00
parent 33d7819619
commit 1909e0fbe8
3 changed files with 25 additions and 1 deletion

@@ -5,11 +5,16 @@ from feedsearch_crawler import search, sort_urls
 from feedsearch_crawler.crawler import coerce_url
 import json
+import logging
 import sys
 url, jsonPath = sys.argv[1:]
+logging.getLogger().setLevel(logging.DEBUG)
+logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
+logging.getLogger(__name__).debug("logging enabled")
 url = coerce_url(url, default_scheme="https")
-items = search(url)
+items = search(url, total_timeout=180, request_timeout=90, max_content_length=100*1024*1024)
 items = sort_urls(items)
 # print all results
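
Pieced together, the driver script after this change plausibly reads like the sketch below. The JSON-writing tail and item.serialize() are assumptions inferred from the jsonPath argument and feedsearch-crawler's FeedInfo API; they are not shown in the diff.

    import json
    import logging
    import sys

    from feedsearch_crawler import search, sort_urls
    from feedsearch_crawler.crawler import coerce_url

    url, jsonPath = sys.argv[1:]

    # send DEBUG-level logs to stdout so slow/large feeds can be watched live
    logging.getLogger().setLevel(logging.DEBUG)
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
    logging.getLogger(__name__).debug("logging enabled")

    url = coerce_url(url, default_scheme="https")
    # generous timeouts plus a 100 MiB body cap, so very large feeds
    # (like congressionaldish) can finish downloading instead of aborting
    items = search(url, total_timeout=180, request_timeout=90,
                   max_content_length=100*1024*1024)
    items = sort_urls(items)

    # print all results (assumed tail: serialize each FeedInfo and dump to jsonPath)
    with open(jsonPath, "w") as f:
        json.dump([item.serialize() for item in items], f, default=str)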

@@ -0,0 +1,13 @@
diff --git a/feedsearch_crawler/crawler/request.py b/feedsearch_crawler/crawler/request.py
index 70f3d5a..00668ad 100644
--- a/feedsearch_crawler/crawler/request.py
+++ b/feedsearch_crawler/crawler/request.py
@@ -277,7 +277,7 @@ class Request(Queueable):
"""
body: bytes = b""
try:
- async for chunk in resp.content.iter_chunked(1024):
+ async for chunk in resp.content.iter_chunked(64 * 1024):
if not chunk:
break
body += chunk
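
The new 0001-response-chunk-size.patch above is the entire fix: aiohttp hands the body back chunk by chunk, so the chunk size bounds how many timeout-checked reads a multi-megabyte feed needs. As a standalone illustration, a minimal sketch of the same read loop; the body cap mirrors the max_content_length argument from the script above, and the cap and URL are illustrative, not the library's exact code.

    import asyncio
    import aiohttp

    CHUNK_SIZE = 64 * 1024  # patched value; the old 1024 meant ~100k reads for a 100 MiB body

    async def fetch_body(url, max_content_length=100 * 1024 * 1024):
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                body = b""
                # same accumulation pattern as the patched method above
                async for chunk in resp.content.iter_chunked(CHUNK_SIZE):
                    if not chunk:
                        break
                    body += chunk
                    if len(body) > max_content_length:
                        raise ValueError("body exceeds max_content_length")
                return body

    # asyncio.run(fetch_body("https://example.com/huge.rss"))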

@@ -28,6 +28,12 @@ buildPythonPackage rec {
hash = "sha256-pzvyeXzqdi8pRjk2+QjKhJfgtxbgVT6C08K9fhVFVmY=";
};
patches = [
# fix for <https://github.com/aio-libs/aiohttp/issues/4581>
# where large feeds would timeout in an unrecoverable way
./0001-response-chunk-size.patch
];
nativeBuildInputs = [
poetry-core
];
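
With the patch wired into the derivation, the rebuilt package ships the larger chunk size. A quick, hypothetical smoke test that the patched source is what actually got installed:

    import inspect
    from feedsearch_crawler.crawler import request

    # the patched read loop should use 64 KiB chunks
    assert "iter_chunked(64 * 1024)" in inspect.getsource(request), "patch not applied"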