feeds/update-feed: refactor
This commit is contained in:
@@ -1,59 +1,131 @@
|
||||
#!/usr/bin/env nix-shell
#!nix-shell -i python3 -p feedsearch-crawler -p podcastindex-db -p python3
|
||||
|
||||
from feedsearch_crawler import search, sort_urls
|
||||
from feedsearch_crawler.crawler import coerce_url
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Iterator
|
||||
|
||||
import argparse
|
||||
import feedsearch_crawler as fsc
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def try_scheme(url: str, scheme: str):
    """Crawl *url* (coerced onto *scheme*) for feeds and return them sorted."""
    full_url = coerce_url(url, default_scheme=scheme)
    print(f"trying {full_url}")
    found = search(
        full_url,
        total_timeout=180,
        request_timeout=90,
        max_content_length=100 * 1024 * 1024,
    )
    return sort_urls(found)
|
||||
@dataclass(order=True)
|
||||
class Feed:
|
||||
# content_type
|
||||
description: str | None = None # not used
|
||||
# favicon
|
||||
# favicon_data_uri # embedded favicon
|
||||
# hubs # PubSub hubs
|
||||
is_podcast: bool | None = None # used by <hosts/common/feeds.nix>
|
||||
# is_push
|
||||
# last_updated
|
||||
# self_url
|
||||
site_name: str | None = None # not used
|
||||
site_url: str | None = None # not used
|
||||
title: str | None = None # used by <hosts/common/feeds.nix> (and others)
|
||||
url: str | None = None # used by <hosts/common/feeds.nix> (and many others)
|
||||
velocity: float | None = None # used by <hosts/common/feeds.nix>
|
||||
# version
|
||||
|
||||
def clean_item(item: dict) -> dict:
|
||||
''' remove keys/values i don't care to keep in git '''
|
||||
item = {
|
||||
k:v for k,v in item.items() if k in [
|
||||
# "content_type",
|
||||
"description", # not used
|
||||
# "favicon",
|
||||
# "favicon_data_uri", # embedded favicon
|
||||
# "hubs", # PubSub hubs
|
||||
"is_podcast", # used by <hosts/common/feeds.nix>
|
||||
# "is_push",
|
||||
# "last_updated",
|
||||
# "self_url",
|
||||
"site_name", # not used
|
||||
"site_url", # not used
|
||||
"title", # used by <hosts/common/feeds.nix> (and others)
|
||||
"url", # used by <hosts/common/feeds.nix> (and many others)
|
||||
"velocity", # used by <hosts/common/feeds.nix>
|
||||
# "version",
|
||||
] and item[k] is not None
|
||||
}
|
||||
# clean up characters for better printability
|
||||
for k in "title", "site_name", "description":
|
||||
if k not in item: continue
|
||||
item[k] = item[k] \
|
||||
.replace("\u2013", "-") \
|
||||
.replace("\u2019", "'") \
|
||||
def __post_init__(self) -> None:
|
||||
def clean(value: str | None) -> str | None:
|
||||
replacements = {
|
||||
"\u2013": "-",
|
||||
"\u2019": "'",
|
||||
}
|
||||
|
||||
return item
|
||||
if value is None:
|
||||
return None
|
||||
|
||||
def try_load_existing_feed(path_: str) -> dict:
|
||||
for from_, to in replacements.items():
|
||||
value = value.replace(from_, to)
|
||||
return value
|
||||
|
||||
# clean up characters for better printability
|
||||
self.title = clean(self.title)
|
||||
self.site_name = clean(self.site_name)
|
||||
self.description = clean(self.description)
|
||||
|
||||
def from_dict(d: dict[str, Any]) -> 'Self':
|
||||
"""
|
||||
populate this feed from a dict,
|
||||
setting non-provided fields to None,
|
||||
and ignoring and extra fields
|
||||
"""
|
||||
return Feed(
|
||||
description = d.get("description"),
|
||||
is_podcast = d.get("is_podcast"),
|
||||
site_name = d.get("site_name"),
|
||||
site_url = d.get("site_url"),
|
||||
title = d.get("title"),
|
||||
url = d.get("url"),
|
||||
velocity = d.get("velocity"),
|
||||
)
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
return dict(
|
||||
description=self.description,
|
||||
is_podcast=self.is_podcast,
|
||||
site_name=self.site_name,
|
||||
site_url=self.site_url,
|
||||
title=self.title,
|
||||
url=self.url,
|
||||
velocity=self.velocity,
|
||||
)
|
||||
|
||||
def to_json(self) -> str:
|
||||
return json.dumps(self.to_dict(), sort_keys=True, indent=2)
|
||||
|
||||
class Locator:
    """Discovers candidate feeds for a site via feedsearch-crawler."""

    def __init__(self, url: str, prefer_podcast: bool) -> None:
        self.url = url
        self.prefer_podcast = prefer_podcast
        self.feeds = []  # NOTE(review): appears unused; kept for compatibility

    def locate_best(self, feeds: list[Feed] | None=None) -> Feed | None:
        """Return the single best feed of *feeds* (crawling if None), or None."""
        if feeds is None:
            feeds = list(self.locate_all())

        # Smallest sort key wins (feeds[0] is returned).
        feeds = sorted(feeds, key=lambda f: (
            # fixed: was `==`, which sorted preference-matching feeds *last*;
            # False (a match) must sort first
            (f.is_podcast != self.prefer_podcast),
            -f.velocity if f.velocity is not None else 0, #< prefer higher-velocity sources
            f.url, #< prefer shorter URLs
            len(f.title) if f.title is not None else 1000, #< prefer shorter titles
            f, #< tie-breaker
        ))
        return feeds[0] if len(feeds) else None

    def locate_all(self) -> Iterator[Feed]:
        """Yield de-duplicated feeds found under scheme and www variants."""
        uris = [
            f"https://{self.url}",
            f"http://{self.url}",
            f"https://www.{self.url}",
            f"http://www.{self.url}",
        ]
        seen = []
        for uri in uris:
            for feed in self.locate_feedsearch(uri):
                if feed not in seen:
                    seen.append(feed)
                    yield feed

    def locate_feedsearch(self, uri: str) -> Iterator[Feed]:
        """Crawl *uri* with feedsearch and yield the results as Feeds."""
        scheme, _separator, url = uri.partition("://")
        # fixed: message had a stray `$` before the placeholder
        assert scheme and url, f"failed to partition {uri!r}"
        url = fsc.crawler.coerce_url(url, default_scheme=scheme)
        print(f"trying {url}")
        feeds = fsc.search(url, total_timeout=180, request_timeout=90, max_content_length=100*1024*1024)
        return (Feed.from_dict(i.serialize()) for i in fsc.sort_urls(feeds))
|
||||
|
||||
def try_load_existing_feed(path_: str) -> Feed:
    """Load a previously-saved Feed from *path_*.

    Returns an empty Feed when the file cannot be opened; invalid JSON
    still propagates (only the open/read failure is best-effort).
    """
    try:
        # `with` closes the handle (the original leaked it), and OSError
        # replaces a bare `except:` that also swallowed KeyboardInterrupt.
        with open(path_, "r") as f:
            contents = f.read()
    except OSError:
        return Feed()
    return Feed.from_dict(json.loads(contents))
|
||||
|
||||
def select_feed(feeds: list[dict], prefer_podcast: bool) -> dict:
|
||||
feeds = sorted(feeds, key=lambda f: (
|
||||
@@ -80,25 +152,17 @@ def main():
|
||||
url, json_path = args.url, args.output
|
||||
|
||||
existing_data = try_load_existing_feed(json_path)
|
||||
prefer_podcast = args.podcast or (existing_data.is_podcast or False)
|
||||
locator = Locator(url, prefer_podcast=prefer_podcast)
|
||||
|
||||
prefer_podcast = args.podcast or existing_data.get("is_podcast", False)
|
||||
|
||||
items = try_scheme(url, "https") \
|
||||
or try_scheme(url, "http") \
|
||||
or try_scheme(f"www.{url}", "https") \
|
||||
or try_scheme(f"www.{url}", "http") \
|
||||
|
||||
# print all results
|
||||
serialized = [item.serialize() for item in items]
|
||||
serialized = [clean_item(s) for s in serialized]
|
||||
for item in serialized:
|
||||
print(json.dumps(item, sort_keys=True, indent=2))
|
||||
all_feeds = list(locator.locate_all())
|
||||
for feed in all_feeds:
|
||||
print(feed.to_json())
|
||||
|
||||
# save the best feed to disk
|
||||
keep = select_feed(serialized, prefer_podcast=prefer_podcast)
|
||||
results = json.dumps(keep, sort_keys=True, indent=2)
|
||||
keep = locator.locate_best(all_feeds)
|
||||
with open(json_path, "w") as out:
|
||||
out.write(results)
|
||||
out.write(keep.to_json())
|
||||
|
||||
# Script entry point.
if __name__ == "__main__":
    main()
|
||||
|
Reference in New Issue
Block a user