feeds/update-feed: refactor

2025-04-28 02:45:50 +00:00
parent 11be114e1d
commit 7319bd1528


@@ -1,59 +1,131 @@
 #!/usr/bin/env nix-shell
-#!nix-shell -i python3 -p feedsearch-crawler -p python3
+#!nix-shell -i python3 -p feedsearch-crawler -p podcastindex-db -p python3
 
-from feedsearch_crawler import search, sort_urls
-from feedsearch_crawler.crawler import coerce_url
+from dataclasses import dataclass
+from typing import Any, Iterator
 import argparse
+import feedsearch_crawler as fsc
 import json
+import logging
 import sys
 
+logger = logging.getLogger(__name__)
+
-def try_scheme(url: str, scheme: str):
-    url = coerce_url(url, default_scheme=scheme)
-    print(f"trying {url}")
-    items = search(url, total_timeout=180, request_timeout=90, max_content_length=100*1024*1024)
-    return sort_urls(items)
-
-def clean_item(item: dict) -> dict:
-    ''' remove keys/values i don't care to keep in git '''
-    item = {
-        k:v for k,v in item.items() if k in [
-            # "content_type",
-            "description", # not used
-            # "favicon",
-            # "favicon_data_uri", # embedded favicon
-            # "hubs", # PubSub hubs
-            "is_podcast", # used by <hosts/common/feeds.nix>
-            # "is_push",
-            # "last_updated",
-            # "self_url",
-            "site_name", # not used
-            "site_url", # not used
-            "title", # used by <hosts/common/feeds.nix> (and others)
-            "url", # used by <hosts/common/feeds.nix> (and many others)
-            "velocity", # used by <hosts/common/feeds.nix>
-            # "version",
-        ] and item[k] is not None
-    }
-    # clean up characters for better printability
-    for k in "title", "site_name", "description":
-        if k not in item: continue
-        item[k] = item[k] \
-            .replace("\u2013", "-") \
-            .replace("\u2019", "'")
-    return item
-
-def try_load_existing_feed(path_: str) -> dict:
+@dataclass(order=True)
+class Feed:
+    # content_type
+    description: str | None = None # not used
+    # favicon
+    # favicon_data_uri # embedded favicon
+    # hubs # PubSub hubs
+    is_podcast: bool | None = None # used by <hosts/common/feeds.nix>
+    # is_push
+    # last_updated
+    # self_url
+    site_name: str | None = None # not used
+    site_url: str | None = None # not used
+    title: str | None = None # used by <hosts/common/feeds.nix> (and others)
+    url: str | None = None # used by <hosts/common/feeds.nix> (and many others)
+    velocity: float | None = None # used by <hosts/common/feeds.nix>
+    # version
+
+    def __post_init__(self) -> None:
+        def clean(value: str | None) -> str | None:
+            replacements = {
+                "\u2013": "-",
+                "\u2019": "'",
+            }
+            if value is None:
+                return None
+            for from_, to in replacements.items():
+                value = value.replace(from_, to)
+            return value
+        # clean up characters for better printability
+        self.title = clean(self.title)
+        self.site_name = clean(self.site_name)
+        self.description = clean(self.description)
+
+    @staticmethod
+    def from_dict(d: dict[str, Any]) -> "Feed":
+        """
+        populate a feed from a dict,
+        setting non-provided fields to None,
+        and ignoring any extra fields
+        """
+        return Feed(
+            description = d.get("description"),
+            is_podcast = d.get("is_podcast"),
+            site_name = d.get("site_name"),
+            site_url = d.get("site_url"),
+            title = d.get("title"),
+            url = d.get("url"),
+            velocity = d.get("velocity"),
+        )
+
+    def to_dict(self) -> dict[str, Any]:
+        return dict(
+            description=self.description,
+            is_podcast=self.is_podcast,
+            site_name=self.site_name,
+            site_url=self.site_url,
+            title=self.title,
+            url=self.url,
+            velocity=self.velocity,
+        )
+
+    def to_json(self) -> str:
+        return json.dumps(self.to_dict(), sort_keys=True, indent=2)
+
+class Locator:
+    def __init__(self, url: str, prefer_podcast: bool) -> None:
+        self.url = url
+        self.prefer_podcast = prefer_podcast
+        self.feeds = []
+
+    def locate_best(self, feeds: list[Feed] | None=None) -> Feed | None:
+        if feeds is None:
+            feeds = list(self.locate_all())
+        feeds = sorted(feeds, key=lambda f: (
+            (f.is_podcast != self.prefer_podcast), #< prefer feeds which match the podcast preference
+            -f.velocity if f.velocity is not None else 0, #< prefer higher-velocity sources
+            f.url, #< prefer lexicographically-earlier URLs
+            len(f.title) if f.title is not None else 1000, #< prefer shorter titles
+            f, #< tie-breaker
+        ))
+        return feeds[0] if len(feeds) else None
+
+    def locate_all(self) -> Iterator[Feed]:
+        uris = [
+            f"https://{self.url}",
+            f"http://{self.url}",
+            f"https://www.{self.url}",
+            f"http://www.{self.url}",
+        ]
+        seen = []
+        for uri in uris:
+            for feed in self.locate_feedsearch(uri):
+                if feed not in seen:
+                    seen.append(feed)
+                    yield feed
+
+    def locate_feedsearch(self, uri: str) -> Iterator[Feed]:
+        scheme, _separator, url = uri.partition("://")
+        assert scheme and url, f"failed to partition {uri!r}"
+        url = fsc.crawler.coerce_url(url, default_scheme=scheme)
+        print(f"trying {url}")
+        feeds = fsc.search(url, total_timeout=180, request_timeout=90, max_content_length=100*1024*1024)
+        return (Feed.from_dict(i.serialize()) for i in fsc.sort_urls(feeds))
+
+def try_load_existing_feed(path_: str) -> Feed:
     try:
         f = open(path_, "r")
     except:
-        return {}
+        return Feed()
     else:
-        return json.loads(f.read())
+        return Feed.from_dict(json.loads(f.read()))
 
-def select_feed(feeds: list[dict], prefer_podcast: bool) -> dict:
-    feeds = sorted(feeds, key=lambda f: (
@@ -80,25 +152,17 @@ def main():
     url, json_path = args.url, args.output
     existing_data = try_load_existing_feed(json_path)
-    prefer_podcast = args.podcast or existing_data.get("is_podcast", False)
-
-    items = try_scheme(url, "https") \
-        or try_scheme(url, "http") \
-        or try_scheme(f"www.{url}", "https") \
-        or try_scheme(f"www.{url}", "http")
+    prefer_podcast = args.podcast or (existing_data.is_podcast or False)
+    locator = Locator(url, prefer_podcast=prefer_podcast)
 
     # print all results
-    serialized = [item.serialize() for item in items]
-    serialized = [clean_item(s) for s in serialized]
-    for item in serialized:
-        print(json.dumps(item, sort_keys=True, indent=2))
+    all_feeds = list(locator.locate_all())
+    for feed in all_feeds:
+        print(feed.to_json())
 
     # save the best feed to disk
-    keep = select_feed(serialized, prefer_podcast=prefer_podcast)
-    results = json.dumps(keep, sort_keys=True, indent=2)
+    keep = locator.locate_best(all_feeds)
     with open(json_path, "w") as out:
-        out.write(results)
+        out.write(keep.to_json())
 
 if __name__ == '__main__':
     main()
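
For reference, a minimal sketch of how the refactored Locator/Feed API composes, written as if it ran inside this script; the domain and output path below are hypothetical, and the argparse wiring between the two hunks is elided above:

    # sketch only: drive Locator/Feed the same way the new main() does
    locator = Locator("example.com", prefer_podcast=False) # hypothetical domain
    all_feeds = list(locator.locate_all())  # crawls the https/http and www. variants
    best = locator.locate_best(all_feeds)   # sorts by podcast-match, velocity, URL, title length
    if best is not None:                    # locate_best() returns None when no feed was found
        with open("feed.json", "w") as out: # hypothetical output path
            out.write(best.to_json())

Passing the already-collected all_feeds into locate_best() avoids a second crawl; calling locate_best() with no argument re-runs locate_all() internally.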