feeds/update-feed: refactor
This commit is contained in:
@@ -1,59 +1,131 @@
|
|||||||
#!/usr/bin/env nix-shell
|
#!/usr/bin/env nix-shell
|
||||||
#!nix-shell -i python3 -p feedsearch-crawler -p python3
|
#!nix-shell -i python3 -p feedsearch-crawler -p podcastindex-db -p python3
|
||||||
|
|
||||||
from feedsearch_crawler import search, sort_urls
|
from dataclasses import dataclass
|
||||||
from feedsearch_crawler.crawler import coerce_url
|
from typing import Any, Iterator
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import feedsearch_crawler as fsc
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import sys
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def try_scheme(url: str, scheme: str):
|
@dataclass(order=True)
|
||||||
url = coerce_url(url, default_scheme=scheme)
|
class Feed:
|
||||||
print(f"trying {url}")
|
# content_type
|
||||||
items = search(url, total_timeout=180, request_timeout=90, max_content_length=100*1024*1024)
|
description: str | None = None # not used
|
||||||
return sort_urls(items)
|
# favicon
|
||||||
|
# favicon_data_uri # embedded favicon
|
||||||
|
# hubs # PubSub hubs
|
||||||
|
is_podcast: bool | None = None # used by <hosts/common/feeds.nix>
|
||||||
|
# is_push
|
||||||
|
# last_updated
|
||||||
|
# self_url
|
||||||
|
site_name: str | None = None # not used
|
||||||
|
site_url: str | None = None # not used
|
||||||
|
title: str | None = None # used by <hosts/common/feeds.nix> (and others)
|
||||||
|
url: str | None = None # used by <hosts/common/feeds.nix> (and many others)
|
||||||
|
velocity: float | None = None # used by <hosts/common/feeds.nix>
|
||||||
|
# version
|
||||||
|
|
||||||
def clean_item(item: dict) -> dict:
|
def __post_init__(self) -> None:
|
||||||
''' remove keys/values i don't care to keep in git '''
|
def clean(value: str | None) -> str | None:
|
||||||
item = {
|
replacements = {
|
||||||
k:v for k,v in item.items() if k in [
|
"\u2013": "-",
|
||||||
# "content_type",
|
"\u2019": "'",
|
||||||
"description", # not used
|
|
||||||
# "favicon",
|
|
||||||
# "favicon_data_uri", # embedded favicon
|
|
||||||
# "hubs", # PubSub hubs
|
|
||||||
"is_podcast", # used by <hosts/common/feeds.nix>
|
|
||||||
# "is_push",
|
|
||||||
# "last_updated",
|
|
||||||
# "self_url",
|
|
||||||
"site_name", # not used
|
|
||||||
"site_url", # not used
|
|
||||||
"title", # used by <hosts/common/feeds.nix> (and others)
|
|
||||||
"url", # used by <hosts/common/feeds.nix> (and many others)
|
|
||||||
"velocity", # used by <hosts/common/feeds.nix>
|
|
||||||
# "version",
|
|
||||||
] and item[k] is not None
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if value is None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
for from_, to in replacements.items():
|
||||||
|
value = value.replace(from_, to)
|
||||||
|
return value
|
||||||
|
|
||||||
# clean up characters for better printability
|
# clean up characters for better printability
|
||||||
for k in "title", "site_name", "description":
|
self.title = clean(self.title)
|
||||||
if k not in item: continue
|
self.site_name = clean(self.site_name)
|
||||||
item[k] = item[k] \
|
self.description = clean(self.description)
|
||||||
.replace("\u2013", "-") \
|
|
||||||
.replace("\u2019", "'") \
|
|
||||||
|
|
||||||
return item
|
def from_dict(d: dict[str, Any]) -> 'Self':
|
||||||
|
"""
|
||||||
|
populate this feed from a dict,
|
||||||
|
setting non-provided fields to None,
|
||||||
|
and ignoring and extra fields
|
||||||
|
"""
|
||||||
|
return Feed(
|
||||||
|
description = d.get("description"),
|
||||||
|
is_podcast = d.get("is_podcast"),
|
||||||
|
site_name = d.get("site_name"),
|
||||||
|
site_url = d.get("site_url"),
|
||||||
|
title = d.get("title"),
|
||||||
|
url = d.get("url"),
|
||||||
|
velocity = d.get("velocity"),
|
||||||
|
)
|
||||||
|
|
||||||
def try_load_existing_feed(path_: str) -> dict:
|
def to_dict(self) -> dict[str, Any]:
|
||||||
|
return dict(
|
||||||
|
description=self.description,
|
||||||
|
is_podcast=self.is_podcast,
|
||||||
|
site_name=self.site_name,
|
||||||
|
site_url=self.site_url,
|
||||||
|
title=self.title,
|
||||||
|
url=self.url,
|
||||||
|
velocity=self.velocity,
|
||||||
|
)
|
||||||
|
|
||||||
|
def to_json(self) -> str:
|
||||||
|
return json.dumps(self.to_dict(), sort_keys=True, indent=2)
|
||||||
|
|
||||||
|
class Locator:
    """
    discovers the feeds published by a site, and picks the one most worth keeping.
    """
    def __init__(self, url: str, prefer_podcast: bool) -> None:
        """
        url: the site to search, WITHOUT a scheme (`locate_all` prepends each scheme it tries).
        prefer_podcast: whether `locate_best` should favor podcast feeds over non-podcast feeds.
        """
        self.url = url
        self.prefer_podcast = prefer_podcast
        self.feeds = []  # NOTE(review): never read by the methods of this class

    def locate_best(self, feeds: "list[Feed] | None" = None) -> "Feed | None":
        """
        return the most preferable of `feeds`, or None if there are no feeds.
        if `feeds` isn't given, they're discovered via `locate_all`.
        """
        if feeds is None:
            feeds = list(self.locate_all())

        def rank(f) -> tuple:
            # lower tuples are more preferable.
            # every component is None-safe, so ranking can never raise TypeError.
            url = f.url or ""
            return (
                bool(f.is_podcast) != bool(self.prefer_podcast),  #< prefer the requested kind of feed (False sorts first)
                -f.velocity if f.velocity is not None else 0,  #< prefer higher-velocity sources
                len(url), url,  #< prefer shorter URLs
                len(f.title) if f.title is not None else 1000,  #< prefer shorter titles
                repr(f),  #< tie-breaker
            )

        return min(feeds, key=rank, default=None)

    def locate_all(self) -> "Iterator[Feed]":
        """
        yield every distinct feed found at the http/https and bare/`www.` variants of `self.url`.
        """
        uris = [
            f"https://{self.url}",
            f"http://{self.url}",
            f"https://www.{self.url}",
            f"http://www.{self.url}",
        ]
        # a list rather than a set, so that feeds only need to support `==`, not hashing
        seen = []
        for uri in uris:
            for feed in self.locate_feedsearch(uri):
                if feed not in seen:
                    seen.append(feed)
                    yield feed

    def locate_feedsearch(self, uri: str) -> "Iterator[Feed]":
        """
        search the one `uri` (which MUST be of the form `<scheme>://<rest>`) using feedsearch-crawler,
        yielding results in the order given by `fsc.sort_urls`.
        """
        scheme, _separator, url = uri.partition("://")
        assert scheme and url, f"failed to partition {uri!r}"
        url = fsc.crawler.coerce_url(url, default_scheme=scheme)
        print(f"trying {url}")
        feeds = fsc.search(url, total_timeout=180, request_timeout=90, max_content_length=100*1024*1024)
        return (Feed.from_dict(i.serialize()) for i in fsc.sort_urls(feeds))
|
||||||
|
|
||||||
|
def try_load_existing_feed(path_: str) -> Feed:
    """
    load the feed previously saved at `path_`.

    returns an empty `Feed` if the file can't be opened (e.g. it hasn't been created yet).
    a file which opens but doesn't hold valid JSON is NOT tolerated:
    the error from `json.loads` propagates to the caller.
    """
    try:
        f = open(path_, "r")
    except (OSError, ValueError):
        # OSError: missing/unreadable file; ValueError: a path which can't name any file.
        # anything else (e.g. KeyboardInterrupt) is deliberately allowed to propagate.
        return Feed()
    with f:  # ensure the handle is closed, even if parsing fails
        return Feed.from_dict(json.loads(f.read()))
|
||||||
|
|
||||||
def select_feed(feeds: list[dict], prefer_podcast: bool) -> dict:
|
def select_feed(feeds: list[dict], prefer_podcast: bool) -> dict:
|
||||||
feeds = sorted(feeds, key=lambda f: (
|
feeds = sorted(feeds, key=lambda f: (
|
||||||
@@ -80,25 +152,17 @@ def main():
|
|||||||
url, json_path = args.url, args.output
|
url, json_path = args.url, args.output
|
||||||
|
|
||||||
existing_data = try_load_existing_feed(json_path)
|
existing_data = try_load_existing_feed(json_path)
|
||||||
|
prefer_podcast = args.podcast or (existing_data.is_podcast or False)
|
||||||
|
locator = Locator(url, prefer_podcast=prefer_podcast)
|
||||||
|
|
||||||
prefer_podcast = args.podcast or existing_data.get("is_podcast", False)
|
all_feeds = list(locator.locate_all())
|
||||||
|
for feed in all_feeds:
|
||||||
items = try_scheme(url, "https") \
|
print(feed.to_json())
|
||||||
or try_scheme(url, "http") \
|
|
||||||
or try_scheme(f"www.{url}", "https") \
|
|
||||||
or try_scheme(f"www.{url}", "http") \
|
|
||||||
|
|
||||||
# print all results
|
|
||||||
serialized = [item.serialize() for item in items]
|
|
||||||
serialized = [clean_item(s) for s in serialized]
|
|
||||||
for item in serialized:
|
|
||||||
print(json.dumps(item, sort_keys=True, indent=2))
|
|
||||||
|
|
||||||
# save the best feed to disk
|
# save the best feed to disk
|
||||||
keep = select_feed(serialized, prefer_podcast=prefer_podcast)
|
keep = locator.locate_best(all_feeds)
|
||||||
results = json.dumps(keep, sort_keys=True, indent=2)
|
|
||||||
with open(json_path, "w") as out:
|
with open(json_path, "w") as out:
|
||||||
out.write(results)
|
out.write(keep.to_json())
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main()
|
main()
|
||||||
|
Reference in New Issue
Block a user