Promote candidate table row families into export

This commit is contained in:
Jan Petykiewicz 2026-04-21 17:56:53 -07:00
commit f7a2342a7c
4 changed files with 370 additions and 10 deletions

View file

@ -61,10 +61,26 @@ pub(crate) struct RuntimeCandidateTableNamedRunScanReport {
pub(crate) files_with_any_numbered_port_runs_count: usize,
pub(crate) files_with_any_numbered_warehouse_runs_count: usize,
pub(crate) files_with_both_numbered_run_families_count: usize,
pub(crate) files_with_port01_11_run_at_45_55_count: usize,
pub(crate) files_with_warehouse01_11_run_at_56_66_count: usize,
pub(crate) port00_warehouse00_row_pair_map_counts: BTreeMap<String, usize>,
pub(crate) port00_warehouse00_row_pair_map_paths: BTreeMap<String, Vec<String>>,
pub(crate) numbered_port_warehouse_trailer_family_map_counts: BTreeMap<String, usize>,
pub(crate) numbered_port_warehouse_trailer_family_map_paths: BTreeMap<String, Vec<String>>,
pub(crate) skipped_file_count: usize,
pub(crate) samples: Vec<RuntimeCandidateTableNamedRunScanSample>,
}
/// Aggregate counters and groupings derived from a set of named-run scan samples.
///
/// Produced by `build_named_run_aggregates` and copied field-for-field into the
/// same-named fields of `RuntimeCandidateTableNamedRunScanReport`.
#[derive(Debug, Default)]
struct RuntimeCandidateTableNamedRunAggregates {
/// Number of samples containing a `Port01`..`Port11` run with count 11, start index 45 and
/// end index 55.
files_with_port01_11_run_at_45_55_count: usize,
/// Number of samples containing a `Warehouse01`..`Warehouse11` run with count 11, start
/// index 56 and end index 66.
files_with_warehouse01_11_run_at_56_66_count: usize,
/// Sample count per `"<Port00 start_index>/<Warehouse00 start_index>"` key; only samples
/// with both single-entry runs contribute.
port00_warehouse00_row_pair_map_counts: BTreeMap<String, usize>,
/// Sorted sample paths for each row-pair key used in the counts map above.
port00_warehouse00_row_pair_map_paths: BTreeMap<String, Vec<String>>,
/// Sample count per trailer-family key; only samples with both numbered runs contribute.
/// The key is the shared trailer signature, or `port=<sig> | warehouse=<sig>` when the two
/// signatures differ.
numbered_port_warehouse_trailer_family_map_counts: BTreeMap<String, usize>,
/// Sorted sample paths for each trailer-family key used in the counts map above.
numbered_port_warehouse_trailer_family_map_paths: BTreeMap<String, Vec<String>>,
}
pub(crate) fn scan_candidate_table_headers(
root_path: &Path,
) -> Result<(), Box<dyn std::error::Error>> {
@ -193,6 +209,7 @@ pub(crate) fn scan_candidate_table_named_runs(
.iter()
.filter(|sample| !sample.port_runs.is_empty() && !sample.warehouse_runs.is_empty())
.count();
let aggregates = build_named_run_aggregates(&samples);
let report = RuntimeCandidateTableNamedRunScanReport {
root_path: root_path.display().to_string(),
@ -201,6 +218,15 @@ pub(crate) fn scan_candidate_table_named_runs(
files_with_any_numbered_port_runs_count,
files_with_any_numbered_warehouse_runs_count,
files_with_both_numbered_run_families_count,
files_with_port01_11_run_at_45_55_count: aggregates.files_with_port01_11_run_at_45_55_count,
files_with_warehouse01_11_run_at_56_66_count: aggregates
.files_with_warehouse01_11_run_at_56_66_count,
port00_warehouse00_row_pair_map_counts: aggregates.port00_warehouse00_row_pair_map_counts,
port00_warehouse00_row_pair_map_paths: aggregates.port00_warehouse00_row_pair_map_paths,
numbered_port_warehouse_trailer_family_map_counts: aggregates
.numbered_port_warehouse_trailer_family_map_counts,
numbered_port_warehouse_trailer_family_map_paths: aggregates
.numbered_port_warehouse_trailer_family_map_paths,
skipped_file_count,
samples,
};
@ -386,3 +412,233 @@ pub(crate) fn load_candidate_table_named_run_scan_sample(
warehouse_runs,
})
}
/// Folds per-file named-run samples into the aggregate counters and groupings of the scan
/// report.
///
/// For every sample this:
/// - counts it when its port runs contain a `Port01`..`Port11` run (count 11, start index 45,
///   end index 55);
/// - counts it when its warehouse runs contain a `Warehouse01`..`Warehouse11` run (count 11,
///   start index 56, end index 66);
/// - when both a single-entry `Port00` run and a single-entry `Warehouse00` run exist, files
///   the sample under the key `"<port00 start_index>/<warehouse00 start_index>"`;
/// - when both numbered runs above exist, files the sample under its trailer-family key: the
///   shared trailer signature, or `port=<sig> | warehouse=<sig>` when the two differ.
///
/// Every path list is sorted before the aggregates are returned.
fn build_named_run_aggregates(
    samples: &[RuntimeCandidateTableNamedRunScanSample],
) -> RuntimeCandidateTableNamedRunAggregates {
    let mut totals = RuntimeCandidateTableNamedRunAggregates::default();

    for sample in samples {
        // Look the numbered runs up once; they feed both the plain counters and the
        // trailer-family grouping below.
        let numbered_port_run = find_named_run(&sample.port_runs, "Port01", "Port11", 45, 55, 11);
        let numbered_warehouse_run = find_named_run(
            &sample.warehouse_runs,
            "Warehouse01",
            "Warehouse11",
            56,
            66,
            11,
        );

        if numbered_port_run.is_some() {
            totals.files_with_port01_11_run_at_45_55_count += 1;
        }
        if numbered_warehouse_run.is_some() {
            totals.files_with_warehouse01_11_run_at_56_66_count += 1;
        }

        let port00_run = find_named_run_by_names(&sample.port_runs, "Port00", "Port00", 1);
        let warehouse00_run =
            find_named_run_by_names(&sample.warehouse_runs, "Warehouse00", "Warehouse00", 1);
        if let Some((port00, warehouse00)) = port00_run.zip(warehouse00_run) {
            let pair_key = format!("{}/{}", port00.start_index, warehouse00.start_index);
            *totals
                .port00_warehouse00_row_pair_map_counts
                .entry(pair_key.clone())
                .or_default() += 1;
            totals
                .port00_warehouse00_row_pair_map_paths
                .entry(pair_key)
                .or_default()
                .push(sample.path.clone());
        }

        if let Some((port_run, warehouse_run)) = numbered_port_run.zip(numbered_warehouse_run) {
            let port_signature = trailer_signature(port_run);
            let warehouse_signature = trailer_signature(warehouse_run);
            // Matching signatures collapse to a single key; differing ones keep both sides.
            let family_key = if port_signature != warehouse_signature {
                format!("port={port_signature} | warehouse={warehouse_signature}")
            } else {
                port_signature
            };
            *totals
                .numbered_port_warehouse_trailer_family_map_counts
                .entry(family_key.clone())
                .or_default() += 1;
            totals
                .numbered_port_warehouse_trailer_family_map_paths
                .entry(family_key)
                .or_default()
                .push(sample.path.clone());
        }
    }

    // Sort the path lists of both maps so the output does not depend on sample order.
    totals
        .port00_warehouse00_row_pair_map_paths
        .values_mut()
        .chain(
            totals
                .numbered_port_warehouse_trailer_family_map_paths
                .values_mut(),
        )
        .for_each(|paths| paths.sort());

    totals
}
/// Returns the first run in `runs` whose first name, last name, entry count, start index and
/// end index all equal the given values, or `None` when no run matches every criterion.
fn find_named_run<'a>(
    runs: &'a [RuntimeCandidateTableNamedRun],
    first_name: &str,
    last_name: &str,
    start_index: usize,
    end_index: usize,
    count: usize,
) -> Option<&'a RuntimeCandidateTableNamedRun> {
    for candidate in runs {
        let names_match = candidate.first_name == first_name && candidate.last_name == last_name;
        let span_matches = candidate.count == count
            && candidate.start_index == start_index
            && candidate.end_index == end_index;
        if names_match && span_matches {
            return Some(candidate);
        }
    }
    None
}
/// Returns the first run in `runs` with the given first name, last name and entry count,
/// regardless of where the run sits in the table; `None` when there is no such run.
fn find_named_run_by_names<'a>(
    runs: &'a [RuntimeCandidateTableNamedRun],
    first_name: &str,
    last_name: &str,
    count: usize,
) -> Option<&'a RuntimeCandidateTableNamedRun> {
    let is_wanted = |candidate: &RuntimeCandidateTableNamedRun| {
        candidate.first_name == first_name
            && candidate.last_name == last_name
            && candidate.count == count
    };
    runs.iter().find(|&candidate| is_wanted(candidate))
}
/// Builds a run's trailer signature by joining its distinct trailer hex words with `|`.
fn trailer_signature(run: &RuntimeCandidateTableNamedRun) -> String {
    let words = &run.distinct_trailer_hex_words;
    words.join("|")
}
#[cfg(test)]
mod tests {
    use super::{
        RuntimeCandidateTableNamedRun, RuntimeCandidateTableNamedRunScanSample,
        build_named_run_aggregates,
    };

    /// Two files share the numbered run layout but differ in their `Port00`/`Warehouse00` rows
    /// and trailer words, so each must land in its own row-pair and trailer-family bucket.
    #[test]
    fn builds_named_run_aggregate_split_fields() {
        let samples = vec![
            sample_file("Louisiana.gmp", 35, 43, "0x00000001"),
            sample_file("State of Germany.gmp", 10, 18, "0x00000000"),
        ];

        let aggregates = build_named_run_aggregates(&samples);

        assert_eq!(aggregates.files_with_port01_11_run_at_45_55_count, 2);
        assert_eq!(aggregates.files_with_warehouse01_11_run_at_56_66_count, 2);

        let row_pair_counts = &aggregates.port00_warehouse00_row_pair_map_counts;
        assert_eq!(row_pair_counts.get("35/43"), Some(&1));
        assert_eq!(row_pair_counts.get("10/18"), Some(&1));

        let trailer_family_counts = &aggregates.numbered_port_warehouse_trailer_family_map_counts;
        assert_eq!(trailer_family_counts.get("0x00000001"), Some(&1));
        assert_eq!(trailer_family_counts.get("0x00000000"), Some(&1));

        let louisiana_paths = vec!["Louisiana.gmp".to_string()];
        assert_eq!(
            aggregates
                .port00_warehouse00_row_pair_map_paths
                .get("35/43"),
            Some(&louisiana_paths)
        );

        let germany_paths = vec!["State of Germany.gmp".to_string()];
        assert_eq!(
            aggregates
                .numbered_port_warehouse_trailer_family_map_paths
                .get("0x00000000"),
            Some(&germany_paths)
        );
    }

    /// Builds a scan sample with single-entry `Port00`/`Warehouse00` runs at the given rows
    /// plus the numbered `Port01..Port11` (45..55) and `Warehouse01..Warehouse11` (56..66)
    /// runs, all carrying `trailer_word` as their only trailer word.
    fn sample_file(
        path: &str,
        port00_row: usize,
        warehouse00_row: usize,
        trailer_word: &str,
    ) -> RuntimeCandidateTableNamedRunScanSample {
        RuntimeCandidateTableNamedRunScanSample {
            path: path.to_string(),
            profile_family: "unknown".to_string(),
            source_kind: "map-fixed-catalog-range".to_string(),
            observed_entry_count: 67,
            port_runs: vec![
                sample_run(
                    "Port00",
                    "Port00",
                    port00_row,
                    port00_row,
                    1,
                    vec![trailer_word],
                ),
                sample_run("Port01", "Port11", 45, 55, 11, vec![trailer_word]),
            ],
            warehouse_runs: vec![
                sample_run(
                    "Warehouse00",
                    "Warehouse00",
                    warehouse00_row,
                    warehouse00_row,
                    1,
                    vec![trailer_word],
                ),
                sample_run("Warehouse01", "Warehouse11", 56, 66, 11, vec![trailer_word]),
            ],
        }
    }

    /// Builds a named run for tests; the prefix is `first_name` with its trailing ASCII digits
    /// removed and both byte offsets are zero.
    fn sample_run(
        first_name: &str,
        last_name: &str,
        start_index: usize,
        end_index: usize,
        count: usize,
        distinct_trailer_hex_words: Vec<&str>,
    ) -> RuntimeCandidateTableNamedRun {
        let prefix = first_name
            .trim_end_matches(|ch: char| ch.is_ascii_digit())
            .to_owned();
        let trailer_words = distinct_trailer_hex_words
            .into_iter()
            .map(String::from)
            .collect();
        RuntimeCandidateTableNamedRun {
            prefix,
            start_index,
            end_index,
            count,
            first_name: first_name.to_owned(),
            last_name: last_name.to_owned(),
            start_offset: 0,
            end_offset: 0,
            distinct_trailer_hex_words: trailer_words,
        }
    }
}