Merge pull request #244656 from jokatzke/trafilatura

python3Packages.trafilatura: init at 1.6.3
This commit is contained in:
Sandro 2024-02-07 11:45:55 +01:00 committed by GitHub
commit bc48aabfa7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 279 additions and 0 deletions

View File

@ -9162,6 +9162,12 @@
fingerprint = "7249 70E6 A661 D84E 8B47 678A 0590 93B1 A278 BCD0";
}];
};
jokatzke = {
  name = "Jonas Katzke";
  email = "jokatzke@fastmail.com";
  github = "jokatzke";
  githubId = 46931073;
};
joko = {
email = "ioannis.koutras@gmail.com";
github = "jokogr";

View File

@ -0,0 +1,54 @@
# Package expression for courlan; the source archive is fetched from PyPI.
{
  lib,
  buildPythonPackage,
  fetchPypi,
  pythonOlder,
  pytestCheckHook,
  langcodes,
  tld,
  urllib3,
}:

buildPythonPackage rec {
  pname = "courlan";
  version = "0.9.5";
  format = "setuptools";

  disabled = pythonOlder "3.6";

  src = fetchPypi {
    inherit pname version;
    hash = "sha256-ONw1suO/H11RbQDVGsEuveVD40F8a+b2oic8D8W1s1M=";
  };

  # Point the test suite at the courlan binary under $out instead of
  # relying on a bare `courlan` command name.
  postPatch = ''
    substituteInPlace tests/unit_tests.py \
    --replace "\"courlan --help\"" "\"$out/bin/courlan --help\"" \
    --replace "courlan_bin = \"courlan\"" "courlan_bin = \"$out/bin/courlan\""
  '';

  propagatedBuildInputs = [
    langcodes
    tld
    urllib3
  ];

  nativeCheckInputs = [ pytestCheckHook ];

  # These tests need an internet connection.
  disabledTests = [ "test_urlcheck" ];

  pythonImportsCheck = [ "courlan" ];

  meta = {
    description = "Clean, filter and sample URLs to optimize data collection";
    homepage = "https://github.com/adbar/courlan";
    changelog = "https://github.com/adbar/courlan/blob/v${version}/HISTORY.md";
    license = lib.licenses.gpl3Plus;
    maintainers = with lib.maintainers; [ jokatzke ];
  };
}

View File

@ -0,0 +1,56 @@
# Package expression for htmldate; the source archive is fetched from PyPI.
{
  lib,
  buildPythonPackage,
  fetchPypi,
  pythonOlder,
  pytestCheckHook,
  backports-datetime-fromisoformat,
  charset-normalizer,
  dateparser,
  lxml,
  python-dateutil,
  urllib3,
}:

buildPythonPackage rec {
  pname = "htmldate";
  version = "1.6.0";
  format = "setuptools";

  disabled = pythonOlder "3.6";

  src = fetchPypi {
    inherit pname version;
    hash = "sha256-WCfI9iahaACinlfoGIo9MtCwjKTHvWYlN7c7u/IsRaY=";
  };

  propagatedBuildInputs =
    [
      charset-normalizer
      dateparser
      lxml
      python-dateutil
      urllib3
    ]
    # The fromisoformat backport is only added for interpreters older than 3.7.
    ++ lib.optionals (pythonOlder "3.7") [ backports-datetime-fromisoformat ];

  nativeCheckInputs = [ pytestCheckHook ];

  # These tests need an internet connection.
  disabledTests = [
    "test_input"
    "test_cli"
    "test_download"
  ];

  pythonImportsCheck = [ "htmldate" ];

  meta = {
    description = "Fast and robust extraction of original and updated publication dates from URLs and web pages";
    homepage = "https://htmldate.readthedocs.io";
    changelog = "https://github.com/adbar/htmldate/blob/v${version}/CHANGELOG.md";
    license = lib.licenses.gpl3Plus;
    maintainers = with lib.maintainers; [ jokatzke ];
  };
}

View File

@ -0,0 +1,43 @@
# Package expression for jusText; the source is fetched from the upstream
# GitHub repository at the tag matching `version`.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,
  pytestCheckHook,
  lxml,
}:

buildPythonPackage rec {
  pname = "justext";
  version = "3.0.0";
  format = "setuptools";

  src = fetchFromGitHub {
    owner = "miso-belica";
    repo = "jusText";
    rev = "refs/tags/v${version}";
    hash = "sha256-WNxDoM5666tEHS9pMl5dOoig4S7dSYaCLZq71tehWqw=";
  };

  # Strip the coverage-report options from setup.cfg.
  postPatch = ''
    substituteInPlace setup.cfg \
    --replace " --cov=justext --cov-report=term-missing --no-cov-on-fail" ""
  '';

  propagatedBuildInputs = [ lxml ];

  nativeCheckInputs = [ pytestCheckHook ];

  pythonImportsCheck = [ "justext" ];

  meta = {
    description = "Heuristic based boilerplate removal tool";
    homepage = "https://github.com/miso-belica/jusText";
    changelog = "https://github.com/miso-belica/jusText/blob/v${version}/CHANGELOG.rst";
    license = lib.licenses.bsd2;
    maintainers = with lib.maintainers; [ jokatzke ];
  };
}

View File

@ -0,0 +1,43 @@
# Package expression for py3langid; the source archive is fetched from PyPI.
{
  lib,
  buildPythonPackage,
  fetchPypi,
  pythonOlder,
  pytestCheckHook,
  numpy,
}:

buildPythonPackage rec {
  pname = "py3langid";
  version = "0.2.2";
  format = "setuptools";

  disabled = pythonOlder "3.6";

  src = fetchPypi {
    inherit pname version;
    hash = "sha256-tN4B2tfnAfKdIWoJNeheCWzIZ1kD0j6oRFsrtfCQuW8=";
  };

  # nixify path to the langid binary in the test suite
  # (the previous comment named the courlan binary, but the substitution
  # below rewrites 'langid' to its location under $out)
  postPatch = ''
    substituteInPlace tests/test_langid.py --replace "'langid'" "'$out/bin/langid'"
  '';

  propagatedBuildInputs = [ numpy ];

  nativeCheckInputs = [ pytestCheckHook ];

  pythonImportsCheck = [ "py3langid" ];

  meta = {
    description = "Fork of the language identification tool langid.py, featuring a modernized codebase and faster execution times";
    homepage = "https://github.com/adbar/py3langid";
    changelog = "https://github.com/adbar/py3langid/blob/v${version}/HISTORY.rst";
    license = lib.licenses.bsd3;
    maintainers = with lib.maintainers; [ jokatzke ];
  };
}

View File

@ -0,0 +1,67 @@
# Package expression for trafilatura; the source archive is fetched from PyPI.
{
  lib,
  buildPythonPackage,
  fetchPypi,
  pythonOlder,
  pytestCheckHook,
  certifi,
  charset-normalizer,
  courlan,
  htmldate,
  justext,
  lxml,
  urllib3,
}:

buildPythonPackage rec {
  pname = "trafilatura";
  version = "1.6.3";
  format = "setuptools";

  disabled = pythonOlder "3.6";

  src = fetchPypi {
    inherit pname version;
    hash = "sha256-Zx3W4AAOEBxLzo1w9ECLy3n8vyJ17iVZHv4z4sihYA0=";
  };

  # First substitution: drop the trafilatura_gui entry point, since the GUI
  # is not supported in this packaging.
  # Second substitution: make the CLI tests use the trafilatura binary
  # under $out.
  postPatch = ''
    substituteInPlace setup.py --replace '"trafilatura_gui=trafilatura.gui:main",' ""
    substituteInPlace tests/cli_tests.py --replace "trafilatura_bin = 'trafilatura'" "trafilatura_bin = '$out/bin/trafilatura'"
  '';

  propagatedBuildInputs = [
    certifi
    charset-normalizer
    courlan
    htmldate
    justext
    lxml
    urllib3
  ];

  nativeCheckInputs = [
    pytestCheckHook
  ];

  # These tests need an internet connection.
  disabledTests = [
    "test_download"
    "test_fetch"
    "test_redirection"
    "test_meta_redirections"
    "test_crawl_page"
    "test_whole"
    "test_probing"
    "test_cli_pipeline"
  ];

  pythonImportsCheck = [ "trafilatura" ];

  meta = {
    description = "Python package and command-line tool designed to gather text on the Web";
    homepage = "https://trafilatura.readthedocs.io";
    changelog = "https://github.com/adbar/trafilatura/blob/v${version}/HISTORY.md";
    license = lib.licenses.gpl3Plus;
    maintainers = with lib.maintainers; [ jokatzke ];
  };
}

View File

@ -2397,6 +2397,8 @@ self: super: with self; {
qemu = pkgs.qemu;
};
courlan = callPackage ../development/python-modules/courlan { };
cov-core = callPackage ../development/python-modules/cov-core { };
coverage = callPackage ../development/python-modules/coverage { };
@ -5280,6 +5282,8 @@ self: super: with self; {
html5-parser = callPackage ../development/python-modules/html5-parser { };
htmldate = callPackage ../development/python-modules/htmldate { };
htmllaundry = callPackage ../development/python-modules/htmllaundry { };
htmllistparse = callPackage ../development/python-modules/htmllistparse { };
@ -6057,6 +6061,8 @@ self: super: with self; {
justbytes = callPackage ../development/python-modules/justbytes { };
justext = callPackage ../development/python-modules/justext { };
justnimbus = callPackage ../development/python-modules/justnimbus { };
jwcrypto = callPackage ../development/python-modules/jwcrypto { };
@ -10085,6 +10091,8 @@ self: super: with self; {
py3exiv2 = callPackage ../development/python-modules/py3exiv2 { };
py3langid = callPackage ../development/python-modules/py3langid { };
py3nvml = callPackage ../development/python-modules/py3nvml { };
py3rijndael = callPackage ../development/python-modules/py3rijndael { };
@ -14668,6 +14676,8 @@ self: super: with self; {
trackpy = callPackage ../development/python-modules/trackpy { };
trafilatura = callPackage ../development/python-modules/trafilatura { };
trailrunner = callPackage ../development/python-modules/trailrunner {};
trainer = callPackage ../development/python-modules/trainer {};