Merge pull request #214454 from winterqt/prefetch-npm-deps-reproducibility

prefetch-npm-deps: fix reproducibility
This commit is contained in:
Lily Foster 2023-05-04 19:15:59 -04:00 committed by GitHub
commit 6a6f328ef5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 469 additions and 98 deletions

View File

@ -56,6 +56,9 @@ npmConfigHook() {
exit 1
fi
export CACHE_MAP_PATH="$TMP/MEOW"
@prefetchNpmDeps@ --map-cache
@prefetchNpmDeps@ --fixup-lockfile "$srcLockfile"
local cachePath
@ -109,6 +112,9 @@ npmConfigHook() {
patchShebangs node_modules
rm "$CACHE_MAP_PATH"
unset CACHE_MAP_PATH
echo "Finished npmConfigHook"
}

View File

@ -305,6 +305,7 @@ dependencies = [
"tempfile",
"ureq",
"url",
"walkdir",
]
[[package]]
@ -400,6 +401,15 @@ version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09"
[[package]]
name = "same-file"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
dependencies = [
"winapi-util",
]
[[package]]
name = "scopeguard"
version = "1.1.0"
@ -583,6 +593,17 @@ version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
[[package]]
name = "walkdir"
version = "2.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56"
dependencies = [
"same-file",
"winapi",
"winapi-util",
]
[[package]]
name = "wasm-bindgen"
version = "0.2.82"
@ -682,6 +703,15 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-util"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
dependencies = [
"winapi",
]
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"

View File

@ -17,3 +17,4 @@ sha2 = "0.10.6"
tempfile = "3.3.0"
ureq = { version = "2.5.0" }
url = { version = "2.3.1", features = ["serde"] }
walkdir = "2.3.2"

View File

@ -1,5 +1,5 @@
use digest::{Digest, Update};
use serde::Serialize;
use serde::{Deserialize, Serialize};
use sha1::Sha1;
use sha2::{Sha256, Sha512};
use std::{
@ -9,24 +9,24 @@ use std::{
};
use url::Url;
#[derive(Serialize)]
struct Key {
key: String,
integrity: String,
time: u8,
size: usize,
metadata: Metadata,
#[derive(Serialize, Deserialize)]
pub(super) struct Key {
pub(super) key: String,
pub(super) integrity: String,
pub(super) time: u8,
pub(super) size: usize,
pub(super) metadata: Metadata,
}
#[derive(Serialize)]
struct Metadata {
url: Url,
options: Options,
#[derive(Serialize, Deserialize)]
pub(super) struct Metadata {
pub(super) url: Url,
pub(super) options: Options,
}
#[derive(Serialize)]
struct Options {
compress: bool,
#[derive(Serialize, Deserialize)]
pub(super) struct Options {
pub(super) compress: bool,
}
pub struct Cache(PathBuf);

View File

@ -1,67 +1,176 @@
#![warn(clippy::pedantic)]
use crate::cacache::Cache;
use anyhow::anyhow;
use crate::cacache::{Cache, Key};
use anyhow::{anyhow, bail};
use rayon::prelude::*;
use serde_json::{Map, Value};
use std::{
collections::HashMap,
env, fs,
path::Path,
path::{Path, PathBuf},
process::{self, Command},
};
use tempfile::tempdir;
use url::Url;
use walkdir::WalkDir;
mod cacache;
mod parse;
/// `fixup_lockfile` removes the `integrity` field from Git dependencies.
fn cache_map_path() -> Option<PathBuf> {
env::var_os("CACHE_MAP_PATH").map(PathBuf::from)
}
/// `fixup_lockfile` rewrites `integrity` hashes to match cache and removes the `integrity` field from Git dependencies.
///
/// Sometimes npm has multiple instances of a given `resolved` URL that have different types of `integrity` hashes (e.g. SHA-1
/// and SHA-512) in the lockfile. Given we only cache one version of these, the `integrity` field must be normalized to the hash
/// we cache as (which is the strongest available one).
///
/// Git dependencies from specific providers can be retrieved from those providers' automatic tarball features.
/// When these dependencies are specified with a commit identifier, npm generates a tarball, and inserts the integrity hash of that
/// tarball into the lockfile.
///
/// Thus, we remove this hash, to replace it with our own determinstic copies of dependencies from hosted Git providers.
fn fixup_lockfile(mut lock: Map<String, Value>) -> anyhow::Result<Option<Map<String, Value>>> {
if lock
///
/// If no fixups were performed, `None` is returned and the lockfile structure should be left as-is. If fixups were performed, the
/// `dependencies` key in v2 lockfiles designed for backwards compatibility with v1 parsers is removed because of inconsistent data.
fn fixup_lockfile(
mut lock: Map<String, Value>,
cache: &Option<HashMap<String, String>>,
) -> anyhow::Result<Option<Map<String, Value>>> {
let mut fixed = false;
match lock
.get("lockfileVersion")
.ok_or_else(|| anyhow!("couldn't get lockfile version"))?
.as_i64()
.ok_or_else(|| anyhow!("lockfile version isn't an int"))?
< 2
{
return Ok(None);
}
1 => fixup_v1_deps(
lock.get_mut("dependencies")
.unwrap()
.as_object_mut()
.unwrap(),
cache,
&mut fixed,
),
2 | 3 => {
for package in lock
.get_mut("packages")
.ok_or_else(|| anyhow!("couldn't get packages"))?
.as_object_mut()
.ok_or_else(|| anyhow!("packages isn't a map"))?
.values_mut()
{
if let Some(Value::String(resolved)) = package.get("resolved") {
if let Some(Value::String(integrity)) = package.get("integrity") {
if resolved.starts_with("git+ssh://") {
fixed = true;
let mut fixed = false;
package
.as_object_mut()
.ok_or_else(|| anyhow!("package isn't a map"))?
.remove("integrity");
} else if let Some(cache_hashes) = cache {
let cache_hash = cache_hashes
.get(resolved)
.expect("dependency should have a hash");
for package in lock
.get_mut("packages")
.ok_or_else(|| anyhow!("couldn't get packages"))?
.as_object_mut()
.ok_or_else(|| anyhow!("packages isn't a map"))?
.values_mut()
{
if let Some(Value::String(resolved)) = package.get("resolved") {
if resolved.starts_with("git+ssh://") && package.get("integrity").is_some() {
fixed = true;
if integrity != cache_hash {
fixed = true;
package
.as_object_mut()
.ok_or_else(|| anyhow!("package isn't a map"))?
.remove("integrity");
*package
.as_object_mut()
.ok_or_else(|| anyhow!("package isn't a map"))?
.get_mut("integrity")
.unwrap() = Value::String(cache_hash.clone());
}
}
}
}
}
if fixed {
lock.remove("dependencies");
}
}
v => bail!("unsupported lockfile version {v}"),
}
if fixed {
lock.remove("dependencies");
Ok(Some(lock))
} else {
Ok(None)
}
}
// Recursive helper to fixup v1 lockfile deps
fn fixup_v1_deps(
dependencies: &mut serde_json::Map<String, Value>,
cache: &Option<HashMap<String, String>>,
fixed: &mut bool,
) {
for dep in dependencies.values_mut() {
if let Some(Value::String(resolved)) = dep
.as_object()
.expect("v1 dep must be object")
.get("resolved")
{
if let Some(Value::String(integrity)) = dep
.as_object()
.expect("v1 dep must be object")
.get("integrity")
{
if resolved.starts_with("git+ssh://") {
*fixed = true;
dep.as_object_mut()
.expect("v1 dep must be object")
.remove("integrity");
} else if let Some(cache_hashes) = cache {
let cache_hash = cache_hashes
.get(resolved)
.expect("dependency should have a hash");
if integrity != cache_hash {
*fixed = true;
*dep.as_object_mut()
.expect("v1 dep must be object")
.get_mut("integrity")
.unwrap() = Value::String(cache_hash.clone());
}
}
}
}
if let Some(Value::Object(more_deps)) = dep.as_object_mut().unwrap().get_mut("dependencies")
{
fixup_v1_deps(more_deps, cache, fixed);
}
}
}
fn map_cache() -> anyhow::Result<HashMap<Url, String>> {
let mut hashes = HashMap::new();
let content_path = Path::new(&env::var_os("npmDeps").unwrap()).join("_cacache/index-v5");
for entry in WalkDir::new(content_path) {
let entry = entry?;
if entry.file_type().is_file() {
let content = fs::read_to_string(entry.path())?;
let key: Key = serde_json::from_str(content.split_ascii_whitespace().nth(1).unwrap())?;
hashes.insert(key.metadata.url, key.integrity);
}
}
Ok(hashes)
}
fn main() -> anyhow::Result<()> {
let args = env::args().collect::<Vec<_>>();
@ -76,12 +185,25 @@ fn main() -> anyhow::Result<()> {
if args[1] == "--fixup-lockfile" {
let lock = serde_json::from_str(&fs::read_to_string(&args[2])?)?;
if let Some(fixed) = fixup_lockfile(lock)? {
let cache = cache_map_path()
.map(|map_path| Ok::<_, anyhow::Error>(serde_json::from_slice(&fs::read(map_path)?)?))
.transpose()?;
if let Some(fixed) = fixup_lockfile(lock, &cache)? {
println!("Fixing lockfile");
fs::write(&args[2], serde_json::to_string(&fixed)?)?;
}
return Ok(());
} else if args[1] == "--map-cache" {
let map = map_cache()?;
fs::write(
cache_map_path().expect("CACHE_MAP_PATH environment variable must be set"),
serde_json::to_string(&map)?,
)?;
return Ok(());
}
@ -105,7 +227,7 @@ fn main() -> anyhow::Result<()> {
eprintln!("{}", package.name);
let tarball = package.tarball()?;
let integrity = package.integrity();
let integrity = package.integrity().map(ToString::to_string);
cache
.put(
@ -133,6 +255,8 @@ fn main() -> anyhow::Result<()> {
#[cfg(test)]
mod tests {
use std::collections::HashMap;
use super::fixup_lockfile;
use serde_json::json;
@ -147,12 +271,20 @@ mod tests {
},
"foo": {
"resolved": "https://github.com/NixOS/nixpkgs",
"integrity": "aaa"
"integrity": "sha1-aaa"
},
"bar": {
"resolved": "git+ssh://git@github.com/NixOS/nixpkgs.git",
"integrity": "bbb"
}
"integrity": "sha512-aaa"
},
"foo-bad": {
"resolved": "foo",
"integrity": "sha1-foo"
},
"foo-good": {
"resolved": "foo",
"integrity": "sha512-foo"
},
}
});
@ -165,22 +297,112 @@ mod tests {
},
"foo": {
"resolved": "https://github.com/NixOS/nixpkgs",
"integrity": "aaa"
"integrity": ""
},
"bar": {
"resolved": "git+ssh://git@github.com/NixOS/nixpkgs.git",
}
},
"foo-bad": {
"resolved": "foo",
"integrity": "sha512-foo"
},
"foo-good": {
"resolved": "foo",
"integrity": "sha512-foo"
},
}
});
let mut hashes = HashMap::new();
hashes.insert(
String::from("https://github.com/NixOS/nixpkgs"),
String::new(),
);
hashes.insert(
String::from("git+ssh://git@github.com/NixOS/nixpkgs.git"),
String::new(),
);
hashes.insert(String::from("foo"), String::from("sha512-foo"));
assert_eq!(
fixup_lockfile(input.as_object().unwrap().clone())?,
fixup_lockfile(input.as_object().unwrap().clone(), &Some(hashes))?,
Some(expected.as_object().unwrap().clone())
);
Ok(())
}
#[test]
fn lockfile_v1_fixup() -> anyhow::Result<()> {
let input = json!({
"lockfileVersion": 1,
"name": "foo",
"dependencies": {
"foo": {
"resolved": "https://github.com/NixOS/nixpkgs",
"integrity": "sha512-aaa"
},
"foo-good": {
"resolved": "foo",
"integrity": "sha512-foo"
},
"bar": {
"resolved": "git+ssh://git@github.com/NixOS/nixpkgs.git",
"integrity": "sha512-bbb",
"dependencies": {
"foo-bad": {
"resolved": "foo",
"integrity": "sha1-foo"
},
},
},
}
});
let expected = json!({
"lockfileVersion": 1,
"name": "foo",
"dependencies": {
"foo": {
"resolved": "https://github.com/NixOS/nixpkgs",
"integrity": ""
},
"foo-good": {
"resolved": "foo",
"integrity": "sha512-foo"
},
"bar": {
"resolved": "git+ssh://git@github.com/NixOS/nixpkgs.git",
"dependencies": {
"foo-bad": {
"resolved": "foo",
"integrity": "sha512-foo"
},
},
},
}
});
let mut hashes = HashMap::new();
hashes.insert(
String::from("https://github.com/NixOS/nixpkgs"),
String::new(),
);
hashes.insert(
String::from("git+ssh://git@github.com/NixOS/nixpkgs.git"),
String::new(),
);
hashes.insert(String::from("foo"), String::from("sha512-foo"));
assert_eq!(
fixup_lockfile(json!({"lockfileVersion": 1}).as_object().unwrap().clone())?,
None
fixup_lockfile(input.as_object().unwrap().clone(), &Some(hashes))?,
Some(expected.as_object().unwrap().clone())
);
Ok(())

View File

@ -1,7 +1,14 @@
use anyhow::{bail, Context};
use anyhow::{anyhow, bail, Context};
use rayon::slice::ParallelSliceMut;
use serde::Deserialize;
use std::{collections::HashMap, fmt};
use serde::{
de::{self, Visitor},
Deserialize, Deserializer,
};
use std::{
cmp::Ordering,
collections::{HashMap, HashSet},
fmt,
};
use url::Url;
pub(super) fn packages(content: &str) -> anyhow::Result<Vec<Package>> {
@ -33,6 +40,13 @@ pub(super) fn packages(content: &str) -> anyhow::Result<Vec<Package>> {
x.resolved
.partial_cmp(&y.resolved)
.expect("resolved should be comparable")
.then(
// v1 lockfiles can contain multiple references to the same version of a package, with
// different integrity values (e.g. a SHA-1 and a SHA-512 in one, but just a SHA-512 in another)
y.integrity
.partial_cmp(&x.integrity)
.expect("integrity should be comparable"),
)
});
packages.dedup_by(|x, y| x.resolved == y.resolved);
@ -54,7 +68,7 @@ struct OldPackage {
#[serde(default)]
bundled: bool,
resolved: Option<UrlOrString>,
integrity: Option<String>,
integrity: Option<HashCollection>,
dependencies: Option<HashMap<String, OldPackage>>,
}
@ -63,7 +77,7 @@ pub(super) struct Package {
#[serde(default)]
pub(super) name: Option<String>,
pub(super) resolved: Option<UrlOrString>,
pub(super) integrity: Option<String>,
pub(super) integrity: Option<HashCollection>,
}
#[derive(Debug, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
@ -82,6 +96,110 @@ impl fmt::Display for UrlOrString {
}
}
#[derive(Debug, PartialEq, Eq)]
pub struct HashCollection(HashSet<Hash>);
impl HashCollection {
pub fn from_str(s: impl AsRef<str>) -> anyhow::Result<HashCollection> {
let hashes = s
.as_ref()
.split_ascii_whitespace()
.map(Hash::new)
.collect::<anyhow::Result<_>>()?;
Ok(HashCollection(hashes))
}
pub fn into_best(self) -> Option<Hash> {
self.0.into_iter().max()
}
}
impl PartialOrd for HashCollection {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
let lhs = self.0.iter().max()?;
let rhs = other.0.iter().max()?;
lhs.partial_cmp(rhs)
}
}
impl<'de> Deserialize<'de> for HashCollection {
fn deserialize<D>(deserializer: D) -> Result<HashCollection, D::Error>
where
D: Deserializer<'de>,
{
deserializer.deserialize_string(HashCollectionVisitor)
}
}
struct HashCollectionVisitor;
impl<'de> Visitor<'de> for HashCollectionVisitor {
type Value = HashCollection;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
formatter.write_str("a single SRI hash or a collection of them (separated by spaces)")
}
fn visit_str<E>(self, value: &str) -> Result<HashCollection, E>
where
E: de::Error,
{
HashCollection::from_str(value).map_err(E::custom)
}
}
#[derive(Clone, Debug, Deserialize, PartialEq, Eq, Hash)]
pub struct Hash(String);
// Hash algorithms, in ascending preference.
const ALGOS: &[&str] = &["sha1", "sha512"];
impl Hash {
fn new(s: impl AsRef<str>) -> anyhow::Result<Hash> {
let algo = s
.as_ref()
.split_once('-')
.ok_or_else(|| anyhow!("expected SRI hash, got {:?}", s.as_ref()))?
.0;
if ALGOS.iter().any(|&a| algo == a) {
Ok(Hash(s.as_ref().to_string()))
} else {
Err(anyhow!("unknown hash algorithm {algo:?}"))
}
}
pub fn as_str(&self) -> &str {
&self.0
}
}
impl fmt::Display for Hash {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
self.as_str().fmt(f)
}
}
impl PartialOrd for Hash {
fn partial_cmp(&self, other: &Hash) -> Option<Ordering> {
let lhs = self.0.split_once('-')?.0;
let rhs = other.0.split_once('-')?.0;
ALGOS
.iter()
.position(|&s| lhs == s)?
.partial_cmp(&ALGOS.iter().position(|&s| rhs == s)?)
}
}
impl Ord for Hash {
fn cmp(&self, other: &Hash) -> Ordering {
self.partial_cmp(other).unwrap()
}
}
#[allow(clippy::case_sensitive_file_extension_comparisons)]
fn to_new_packages(
old_packages: HashMap<String, OldPackage>,
@ -149,8 +267,13 @@ fn get_initial_url() -> anyhow::Result<Url> {
#[cfg(test)]
mod tests {
use super::{get_initial_url, to_new_packages, OldPackage, Package, UrlOrString};
use std::collections::HashMap;
use super::{
get_initial_url, to_new_packages, Hash, HashCollection, OldPackage, Package, UrlOrString,
};
use std::{
cmp::Ordering,
collections::{HashMap, HashSet},
};
use url::Url;
#[test]
@ -188,4 +311,23 @@ mod tests {
Ok(())
}
#[test]
fn hash_preference() {
assert_eq!(
Hash(String::from("sha1-foo")).partial_cmp(&Hash(String::from("sha512-foo"))),
Some(Ordering::Less)
);
assert_eq!(
HashCollection({
let mut set = HashSet::new();
set.insert(Hash(String::from("sha512-foo")));
set.insert(Hash(String::from("sha1-bar")));
set
})
.into_best(),
Some(Hash(String::from("sha512-foo")))
);
}
}

View File

@ -9,7 +9,7 @@ use std::{
use tempfile::{tempdir, TempDir};
use url::Url;
mod lock;
pub mod lock;
pub fn lockfile(content: &str, force_git_deps: bool) -> anyhow::Result<Vec<Package>> {
let mut packages = lock::packages(content)
@ -87,7 +87,7 @@ pub struct Package {
#[derive(Debug)]
enum Specifics {
Registry { integrity: String },
Registry { integrity: lock::Hash },
Git { workdir: TempDir },
}
@ -134,11 +134,11 @@ impl Package {
Specifics::Git { workdir }
}
None => Specifics::Registry {
integrity: get_ideal_hash(
&pkg.integrity
.expect("non-git dependencies should have assosciated integrity"),
)?
.to_string(),
integrity: pkg
.integrity
.expect("non-git dependencies should have assosciated integrity")
.into_best()
.expect("non-git dependencies should have non-empty assosciated integrity"),
},
};
@ -181,9 +181,9 @@ impl Package {
}
}
pub fn integrity(&self) -> Option<String> {
pub fn integrity(&self) -> Option<&lock::Hash> {
match &self.specifics {
Specifics::Registry { integrity } => Some(integrity.clone()),
Specifics::Registry { integrity } => Some(integrity),
Specifics::Git { .. } => None,
}
}
@ -304,25 +304,9 @@ fn get_hosted_git_url(url: &Url) -> anyhow::Result<Option<Url>> {
}
}
fn get_ideal_hash(integrity: &str) -> anyhow::Result<&str> {
let split: Vec<_> = integrity.split_ascii_whitespace().collect();
if split.len() == 1 {
Ok(split[0])
} else {
for hash in ["sha512-", "sha1-"] {
if let Some(h) = split.iter().find(|s| s.starts_with(hash)) {
return Ok(h);
}
}
Err(anyhow!("not sure which hash to select out of {split:?}"))
}
}
#[cfg(test)]
mod tests {
use super::{get_hosted_git_url, get_ideal_hash};
use super::get_hosted_git_url;
use url::Url;
#[test]
@ -353,18 +337,4 @@ mod tests {
"GitLab URLs should be marked as invalid (lol)"
);
}
#[test]
fn ideal_hashes() {
for (input, expected) in [
("sha512-foo sha1-bar", Some("sha512-foo")),
("sha1-bar md5-foo", Some("sha1-bar")),
("sha1-bar", Some("sha1-bar")),
("sha512-foo", Some("sha512-foo")),
("foo-bar sha1-bar", Some("sha1-bar")),
("foo-bar baz-foo", None),
] {
assert_eq!(get_ideal_hash(input).ok(), expected);
}
}
}