focus/internals/src/lib/index/content_hash.rs (480 lines of code) (raw):

// Copyright 2022 Twitter, Inc. // SPDX-License-Identifier: Apache-2.0 use std::cell::RefCell; use std::collections::{BTreeSet, HashMap}; use std::fmt::{Display, Write}; use std::hash::Hash; use std::path::Path; use std::path::PathBuf; use std::str::FromStr; use lazy_static::lazy_static; use regex::Regex; use thiserror::Error; use tracing::debug; use tracing::error; use tracing::trace; use tracing::warn; use crate::target::Label; use crate::target::TargetName; use focus_util::paths::is_relevant_to_build_graph; use super::DependencyKey; /// This value is mixed into all content hashes. Update this value when /// content-hashing changes in a backward-incompatible way. const VERSION: usize = 7; /// The hash of a [`DependencyKey`]'s syntactic content. #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct ContentHash(pub(super) git2::Oid); impl From<ContentHash> for git2::Oid { fn from(hash: ContentHash) -> Self { let ContentHash(oid) = hash; oid } } impl Display for ContentHash { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let Self(hash) = self; write!(f, "{}", hash) } } impl FromStr for ContentHash { type Err = git2::Error; fn from_str(s: &str) -> std::result::Result<Self, Self::Err> { let oid = git2::Oid::from_str(s)?; Ok(ContentHash(oid)) } } #[derive(Debug, Default)] pub struct Caches { /// Cache of hashed dependency keys. These are only valid for the provided `repo`/`head_tree`. dependency_key_cache: HashMap<DependencyKey, ContentHash>, /// Cache of hashed tree paths, which should be either: /// /// - Bazel-relevant build files. /// - Directories. /// - Non-existent. /// /// These are only valid for the provided `repo`/`head_tree`. tree_path_cache: HashMap<PathBuf, ContentHash>, /// Cache of parsed load dependencies. The OIDs here are tree OIDs which /// have to be traversed to find any files relevant to the build graph. load_dependencies_cache: HashMap<git2::Oid, BTreeSet<Label>>, /// Cache of dependencies loaded from the `prelude_bazel` file. prelude_deps_cache: Option<BTreeSet<Label>>, } /// Context used to compute a content hash. pub struct HashContext<'a> { /// The Git repository. repo: &'a git2::Repository, /// The tree corresponding to the current working copy. head_tree: &'a git2::Tree<'a>, /// Associated caches. caches: RefCell<Caches>, } impl std::fmt::Debug for HashContext<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let Self { repo, head_tree, caches, } = self; f.debug_struct("HashContext") .field("repo", &repo.path()) .field("head_tree", &head_tree.id()) .field("caches", &caches) .finish() } } impl<'repo> HashContext<'repo> { /// Construct a new hash context from the given repository state. pub fn new(repo: &'repo git2::Repository, head_tree: &'repo git2::Tree) -> Result<Self> { Ok(Self { repo, head_tree, caches: Default::default(), }) } /// Get the underlying repository. pub fn repo(&self) -> &git2::Repository { self.repo } /// Get the underlying head tree. pub fn head_tree(&self) -> &git2::Tree { self.head_tree } } #[derive(Debug, Error)] pub enum Error { #[error("could not read tree entry: {0}")] ReadTreeEntry(#[source] git2::Error), #[error("could not hash object: {0}")] HashObject(#[source] git2::Error), #[error("I/O error: {0}")] Fmt(#[from] std::fmt::Error), #[error("bug: {0}")] Bug(String), } fn clone_git_error(error: &git2::Error) -> git2::Error { git2::Error::new(error.code(), error.class(), error.message()) } impl Clone for Error { fn clone(&self) -> Self { match self { Self::ReadTreeEntry(e) => Self::ReadTreeEntry(clone_git_error(e)), Self::HashObject(e) => Self::HashObject(clone_git_error(e)), Self::Fmt(e) => Self::Fmt(*e), Self::Bug(message) => Self::Bug(message.clone()), } } } pub type Result<T> = std::result::Result<T, Error>; /// Compute a content-addressable hash for the provided [`DependencyKey`] using /// the context in `ctx`. pub fn content_hash(ctx: &HashContext, key: &DependencyKey) -> Result<ContentHash> { let key = key.clone(); content_hash_dependency_key(ctx, key) } fn content_hash_dependency_key(ctx: &HashContext, key: DependencyKey) -> Result<ContentHash> { debug!(?key, "Hashing dependency key"); { let cache = &mut ctx.caches.borrow_mut().dependency_key_cache; if let Some(hash) = cache.get(&key) { return Ok(hash.to_owned()); } } let (kind, maybe_label, values_to_hash) = get_dependencies(ctx, &key)?; let mut buf = String::new(); write!(&mut buf, "DependencyKeyV{VERSION}::{kind}(")?; if let Some(label) = maybe_label { write!(&mut buf, "{label}, ")?; } let hashes = values_to_hash .into_iter() .map(|key_or_hash| match key_or_hash { KeyOrPath::Key(dep_key) => content_hash_dependency_key(ctx, dep_key), KeyOrPath::Path(path) => content_hash_tree_path(ctx, path), }) .collect::<Result<Vec<_>>>()?; for hash in hashes { write!(&mut buf, "{hash}, ")?; } write!(&mut buf, ")")?; let hash = git2::Oid::hash_object(git2::ObjectType::Blob, buf.as_bytes()) .map_err(Error::HashObject)?; let hash = ContentHash(hash); if let Some(old_value) = ctx .caches .borrow_mut() .dependency_key_cache .insert(key.to_owned(), hash.clone()) { if old_value != hash { error!(?key, ?old_value, new_value = ?hash, "Non-deterministic content hashing for dependency key"); } } Ok(hash) } #[derive(Debug)] enum KeyOrPath<'a> { Key(DependencyKey), Path(&'a Path), } fn get_dependencies<'a>( ctx: &HashContext, dep_key: &'a DependencyKey, ) -> Result<(&'static str, Option<&'a Label>, Vec<KeyOrPath<'a>>)> { match dep_key { DependencyKey::BazelPackage( label @ Label { external_repository, path_components, target_name: _, }, ) => { let path: PathBuf = path_components.iter().collect(); let mut dep_keys = vec![DependencyKey::Path(path.clone())]; dep_keys.extend(match external_repository { Some(_) => vec![], None => { let mut loaded_deps = match get_tree_for_path(ctx, &path)? { Some(tree) => find_load_dependencies(ctx, &tree)?, None => Default::default(), }; let prelude_deps = get_prelude_deps(ctx)?; loaded_deps.extend(prelude_deps); loaded_deps .into_iter() .map(DependencyKey::BazelBuildFile) .collect() } }); // Every package has an implicit dependency on the `WORKSPACE` file. dep_keys.push(DependencyKey::BazelBuildFile(Label { external_repository: None, path_components: Vec::new(), target_name: TargetName::Name("WORKSPACE".to_string()), })); Ok(( "BazelPackage", Some(label), dep_keys.into_iter().map(KeyOrPath::Key).collect(), )) } DependencyKey::BazelBuildFile( label @ Label { external_repository, path_components, target_name, }, ) => { let mut dep_keys = match (external_repository, target_name) { (Some(_), _) => Vec::new(), (None, TargetName::Ellipsis) => { return Err(Error::Bug(format!( "Got label referring to a ellipsis, but it should be a BUILD or .bzl file: {:?}", label ))); } (None, TargetName::Name(target_name)) => { let path: PathBuf = { let mut path: PathBuf = path_components.iter().collect(); path.push(target_name); path }; let mut dep_keys = vec![DependencyKey::Path(path.clone())]; let loaded_deps = match ctx.head_tree.get_path(&path) { Ok(tree_entry) => { if is_tree_entry_relevant_to_build_graph(&tree_entry) { extract_load_statements_from_tree_entry(ctx, &tree_entry)? } else { Default::default() } } Err(e) if e.code() == git2::ErrorCode::NotFound => Default::default(), Err(e) => return Err(Error::ReadTreeEntry(e)), }; dep_keys.extend( loaded_deps .into_iter() .map(DependencyKey::BazelBuildFile) .filter(|key| dep_key != key), ); dep_keys } }; // Every `.bzl` file (or similar) has an implicit dependency on the // `WORKSPACE` file. However, the `WORKSPACE` file itself may `load` // `.bzl` files in the repository. To avoid a circular dependency, // use only the textual hash of the WORKSPACE as the dependency key // here. dep_keys.push(DependencyKey::Path("WORKSPACE".into())); Ok(( "BazelBuildFile", Some(label), dep_keys.into_iter().map(KeyOrPath::Key).collect(), )) } DependencyKey::Path(path) => Ok(("Path", None, vec![KeyOrPath::Path(path)])), DependencyKey::DummyForTesting(inner_dep_key) => Ok(( "DummyForTesting", None, vec![KeyOrPath::Key(inner_dep_key.as_ref().clone())], )), } } pub fn get_workspace_deps(ctx: &HashContext) -> Result<BTreeSet<DependencyKey>> { let workspace_key = DependencyKey::BazelBuildFile(Label { external_repository: None, path_components: Vec::new(), target_name: TargetName::Name("WORKSPACE".to_string()), }); let (_, _, deps) = get_dependencies(ctx, &workspace_key)?; Ok(deps .into_iter() .filter_map(|dep_key| match dep_key { KeyOrPath::Key(key) => Some(key), KeyOrPath::Path(_) => None, }) .collect()) } /// Get the dependencies induced by the special /// `tools/build_rules/prelude_bazel` file (if present). See /// https://github.com/bazelbuild/bazel/issues/1674 for discussion on what this /// file is. pub fn get_prelude_deps(ctx: &HashContext) -> Result<BTreeSet<Label>> { if let Some(prelude_deps) = &ctx.caches.borrow().prelude_deps_cache { return Ok(prelude_deps.clone()); } let prelude_dir = ["tools", "build_rules"]; let prelude_file_name = "prelude_bazel"; let prelude_path: PathBuf = prelude_dir.into_iter().chain([prelude_file_name]).collect(); let result = match ctx.head_tree.get_path(&prelude_path) { Ok(tree_entry) => { let mut result = BTreeSet::new(); result.insert(Label { external_repository: None, path_components: prelude_dir.into_iter().map(|s| s.to_string()).collect(), target_name: TargetName::Name(prelude_file_name.to_string()), }); result.extend(extract_load_statements_from_tree_entry(ctx, &tree_entry)?); result } Err(err) if err.code() == git2::ErrorCode::NotFound => Default::default(), Err(err) => return Err(Error::ReadTreeEntry(err)), }; ctx.caches.borrow_mut().prelude_deps_cache = Some(result.clone()); Ok(result) } fn content_hash_tree_path(ctx: &HashContext, path: &Path) -> Result<ContentHash> { if let Some(hash) = ctx.caches.borrow().tree_path_cache.get(path) { return Ok(hash.clone()); } let mut buf = String::new(); write!( &mut buf, "PathBufV{VERSION}({tree_id})", tree_id = get_tree_path_id(ctx.head_tree, path).map_err(Error::ReadTreeEntry)?, )?; let hash = git2::Oid::hash_object(git2::ObjectType::Blob, buf.as_bytes()) .map_err(Error::HashObject)?; let hash = ContentHash(hash); if let Some(old_value) = ctx .caches .borrow_mut() .tree_path_cache .insert(path.to_owned(), hash.clone()) { if old_value != hash { error!(key = ?path, ?old_value, new_value = ?hash, "Non-deterministic content hashing for path"); } } Ok(hash) } fn get_tree_path_id(tree: &git2::Tree, path: &Path) -> std::result::Result<git2::Oid, git2::Error> { if path == Path::new("") { // `get_path` will produce an error if we pass an empty path, so // manually handle that here. Ok(tree.id()) } else { match tree.get_path(path) { Ok(entry) => Ok(entry.id()), Err(err) if err.code() == git2::ErrorCode::NotFound => { // TODO: test this code path Ok(git2::Oid::zero()) } Err(err) => Err(err), } } } fn get_tree_for_path<'repo>( ctx: &HashContext<'repo>, package_path: &Path, ) -> Result<Option<git2::Tree<'repo>>> { if package_path == Path::new("") { Ok(Some(ctx.head_tree.to_owned())) } else { let tree_entry = match ctx.head_tree.get_path(package_path) { Ok(tree_entry) => tree_entry, Err(e) if e.code() == git2::ErrorCode::NotFound => return Ok(None), Err(e) => return Err(Error::ReadTreeEntry(e)), }; let object = tree_entry .to_object(ctx.repo) .map_err(Error::ReadTreeEntry)?; let tree = object.as_tree().map(|tree| tree.to_owned()); Ok(tree) } } fn find_load_dependencies(ctx: &HashContext, tree: &git2::Tree) -> Result<BTreeSet<Label>> { trace!(?tree, "Finding load dependencies"); if let Some(result) = ctx.caches.borrow().load_dependencies_cache.get(&tree.id()) { return Ok(result.clone()); } let mut result = BTreeSet::new(); for tree_entry in tree { if is_tree_entry_relevant_to_build_graph(&tree_entry) { let deps = extract_load_statements_from_tree_entry(ctx, &tree_entry)?; result.extend(deps); } } if let Some(old_value) = ctx .caches .borrow_mut() .load_dependencies_cache .insert(tree.id(), result.clone()) { if old_value != result { error!(key = ?tree.id(), ?old_value, new_value = ?result, "Non-deterministic content hashing for load dependencies"); } } Ok(result) } fn is_tree_entry_relevant_to_build_graph(tree_entry: &git2::TreeEntry) -> bool { match tree_entry.name() { Some(file_name) => is_relevant_to_build_graph(Path::new(file_name)), None => { warn!(name_bytes = ?tree_entry.name_bytes(), "Skipped tree entry with non-UTF-8 name"); false } } } fn extract_load_statements_from_tree_entry( ctx: &HashContext, tree_entry: &git2::TreeEntry, ) -> Result<BTreeSet<Label>> { let object = tree_entry .to_object(ctx.repo) .map_err(Error::ReadTreeEntry)?; let blob = match object.as_blob() { Some(blob) => blob, None => { warn!(file_name = ?tree_entry.name(), "Tree entry was not a blob"); return Ok(Default::default()); } }; let content = match std::str::from_utf8(blob.content()) { Ok(content) => content, Err(e) => { warn!(file_name = ?tree_entry.name(), ?e, "Could not decode non-UTF-8 blob content"); return Ok(Default::default()); } }; let deps = extract_load_statement_package_dependencies(content); Ok(deps) } fn extract_load_statement_package_dependencies(content: &str) -> BTreeSet<Label> { lazy_static! { static ref RE: Regex = Regex::new( r#"(?x) # Literal "load". load \s*? # Open parenthesis. \( \s*? # String literal enclosed in quotes. (?: "( [[:print:]--"]*? )" | '( [[:print:]--']*? )' ) # Either a closing parenthesis or a comma to start the argument list. \s*? (?: , | \) ) "# ) .unwrap(); } let mut result = BTreeSet::new(); for cap in RE.captures_iter(content) { let value = cap.get(1).or_else(|| cap.get(2)).unwrap().as_str(); let label: Label = match value.parse() { Ok(label) => label, Err(e) => { warn!(?e, "Failed to parse label in load statement"); continue; } }; result.insert(label); } result } #[cfg(test)] mod tests { use super::*; #[test] fn test_extract_load_statements() -> Result<()> { let content = r#" load("//foo/bar:baz.bzl") load ( '//foo/qux:qux.bzl' , qux = 'grault') "#; let labels = extract_load_statement_package_dependencies(content); insta::assert_debug_snapshot!(labels, @r###" { Label("//foo/bar:baz.bzl"), Label("//foo/qux:qux.bzl"), } "###); Ok(()) } }