Classify engine type parser families

This commit is contained in:
Jan Petykiewicz 2026-04-21 23:05:10 -07:00
commit f3c3eb7262
4 changed files with 149 additions and 17 deletions

View file

@ -146,9 +146,11 @@ pub struct EngineTypesInspectionReport {
pub unmatched_cct_file_count: usize,
pub car_side_view_resource_counts: BTreeMap<String, usize>,
pub car_auxiliary_stem_counts: BTreeMap<String, usize>,
pub car_auxiliary_stem_relation_counts: BTreeMap<String, usize>,
pub lco_companion_stem_counts: BTreeMap<String, usize>,
pub lco_body_type_label_counts: BTreeMap<String, usize>,
pub cgo_scalar_value_counts: BTreeMap<String, usize>,
pub cgo_scalar_ladder_counts: BTreeMap<String, usize>,
pub cgo_scalar_values_by_content_stem: BTreeMap<String, Vec<String>>,
pub cct_identifier_counts: BTreeMap<String, usize>,
pub cct_value_counts: BTreeMap<String, usize>,
@ -356,6 +358,11 @@ pub fn inspect_engine_types_dir(
.iter()
.filter_map(|family| family.auxiliary_stem.as_deref()),
);
let car_auxiliary_stem_relation_counts = count_owned_values(
family_entries
.iter()
.filter_map(classify_car_auxiliary_stem_relation),
);
let lco_companion_stem_counts = count_named_values(
family_entries
.iter()
@ -373,6 +380,8 @@ pub fn inspect_engine_types_dir(
);
let cgo_scalar_values_by_content_stem =
build_cgo_scalar_values_by_content_stem(cgo_reports.values());
let cgo_scalar_ladder_counts =
build_cgo_scalar_ladder_counts(cgo_scalar_values_by_content_stem.values());
let cct_identifier_counts = count_named_values(
family_entries
.iter()
@ -428,9 +437,11 @@ pub fn inspect_engine_types_dir(
.count(),
car_side_view_resource_counts,
car_auxiliary_stem_counts,
car_auxiliary_stem_relation_counts,
lco_companion_stem_counts,
lco_body_type_label_counts,
cgo_scalar_value_counts,
cgo_scalar_ladder_counts,
cgo_scalar_values_by_content_stem,
cct_identifier_counts,
cct_value_counts,
@ -674,10 +685,34 @@ fn count_owned_values(values: impl Iterator<Item = String>) -> BTreeMap<String,
counts
}
/// Classifies how a family's auxiliary stem relates to its internal stem.
///
/// Returns `None` when either stem is absent on the entry. Otherwise returns one of
/// these labels:
///
/// - `matches_internal_stem`: the two stems are byte-identical.
/// - `matches_internal_without_role_suffix`: the auxiliary stem equals the internal
///   stem once its terminal role letter (`L`/`T`, either case) is stripped.
/// - `matches_internal_without_role_suffix_casefolded`: as above, but only when ASCII
///   case is ignored.
/// - `distinct_auxiliary_stem`: both stems are present and none of the above holds.
fn classify_car_auxiliary_stem_relation(family: &EngineTypeFamilyEntry) -> Option<String> {
    let auxiliary_stem = family.auxiliary_stem.as_deref()?;
    let internal_stem = family.internal_stem.as_deref()?;

    let relation = if auxiliary_stem == internal_stem {
        "matches_internal_stem"
    } else {
        // An internal stem without a role letter cannot match through the suffix rules,
        // but the two stems still differ, so the family is counted as distinct. Using `?`
        // on the stripped stem here would return `None` and silently drop the family
        // from the relation counts.
        match strip_terminal_role_letter(internal_stem) {
            Some(root) if auxiliary_stem == root => "matches_internal_without_role_suffix",
            Some(root) if auxiliary_stem.eq_ignore_ascii_case(root) => {
                "matches_internal_without_role_suffix_casefolded"
            }
            _ => "distinct_auxiliary_stem",
        }
    };

    Some(relation.to_string())
}
/// Returns `value` without its final character when that character is a role letter
/// (`L` or `T`, in either ASCII case).
///
/// Returns `None` when `value` is empty or ends in any other character. Only a single
/// trailing letter is removed.
fn strip_terminal_role_letter(value: &str) -> Option<&str> {
    value.strip_suffix(['L', 'T', 'l', 't'])
}
fn build_cgo_scalar_values_by_content_stem<'a>(
reports: impl Iterator<Item = &'a EngineTypeCgoInspectionReport>,
) -> BTreeMap<String, Vec<String>> {
let mut grouped = BTreeMap::<String, Vec<String>>::new();
let mut grouped = BTreeMap::<String, Vec<f32>>::new();
for report in reports {
let Some(content_stem) = report.content_stem.as_ref() else {
continue;
@ -688,13 +723,28 @@ fn build_cgo_scalar_values_by_content_stem<'a>(
grouped
.entry(content_stem.clone())
.or_default()
.push(format!("{leading_f32:.6}"));
}
for values in grouped.values_mut() {
values.sort();
values.dedup();
.push(leading_f32);
}
grouped
.into_iter()
.map(|(content_stem, mut values)| {
values.sort_by(f32::total_cmp);
values.dedup();
(
content_stem,
values
.into_iter()
.map(|value| format!("{value:.6}"))
.collect::<Vec<_>>(),
)
})
.collect()
}
/// Counts how many ladders share the same sequence of values.
///
/// Each ladder is rendered as a single key by joining its entries with `" -> "`, and
/// the resulting keys are tallied by `count_owned_values`.
fn build_cgo_scalar_ladder_counts<'a>(
    ladders: impl Iterator<Item = &'a Vec<String>>,
) -> BTreeMap<String, usize> {
    let ladder_keys = ladders.map(|steps| steps.join(" -> "));
    count_owned_values(ladder_keys)
}
#[cfg(test)]
@ -824,9 +874,8 @@ mod tests {
// Repeated borrowed names must be tallied per distinct value.
#[test]
fn counts_directory_level_slot_values() {
    let counts =
        count_named_values(["CarSideView_1.imb", "CarSideView_1.imb", "VL80T"].into_iter());
    assert_eq!(counts.get("CarSideView_1.imb"), Some(&2));
    assert_eq!(counts.get("VL80T"), Some(&1));
}
@ -868,9 +917,73 @@ mod tests {
);
}
// Exercises every relation label: identical stems, role-suffix stripped (exact and
// ASCII-case-insensitive), and two flavours of genuinely distinct auxiliary stems.
#[test]
fn classifies_car_auxiliary_stem_relations() {
    // Baseline entry whose two stems are byte-identical; every other case overrides
    // only the two stem fields.
    let identical = EngineTypeFamilyEntry {
        canonical_stem: "gp7".to_string(),
        car_file: None,
        lco_file: None,
        cgo_file: None,
        cct_file: None,
        primary_display_name: None,
        content_name: None,
        internal_stem: Some("GP7L".to_string()),
        auxiliary_stem: Some("GP7L".to_string()),
        side_view_resource: None,
        companion_stem: None,
        body_type_label: None,
        cct_identifier: None,
        cct_value: None,
        has_matched_locomotive_pair: false,
    };
    let stripped = EngineTypeFamilyEntry {
        internal_stem: Some("Class01L".to_string()),
        auxiliary_stem: Some("Class01".to_string()),
        ..identical.clone()
    };
    // Same root as `stripped`, but differing only in ASCII case.
    let stripped_casefolded = EngineTypeFamilyEntry {
        internal_stem: Some("Class01L".to_string()),
        auxiliary_stem: Some("class01".to_string()),
        ..identical.clone()
    };
    // Reordered root: not a match even after stripping the role letter and ignoring case.
    let reordered_root = EngineTypeFamilyEntry {
        internal_stem: Some("classqjt".to_string()),
        auxiliary_stem: Some("qjclasst".to_string()),
        ..identical.clone()
    };
    // Same root with the opposite role letter still counts as distinct.
    let swapped_role = EngineTypeFamilyEntry {
        internal_stem: Some("ClassA1T".to_string()),
        auxiliary_stem: Some("ClassA1L".to_string()),
        ..identical.clone()
    };

    assert_eq!(
        classify_car_auxiliary_stem_relation(&identical),
        Some("matches_internal_stem".to_string())
    );
    assert_eq!(
        classify_car_auxiliary_stem_relation(&stripped),
        Some("matches_internal_without_role_suffix".to_string())
    );
    assert_eq!(
        classify_car_auxiliary_stem_relation(&stripped_casefolded),
        Some("matches_internal_without_role_suffix_casefolded".to_string())
    );
    assert_eq!(
        classify_car_auxiliary_stem_relation(&reordered_root),
        Some("distinct_auxiliary_stem".to_string())
    );
    assert_eq!(
        classify_car_auxiliary_stem_relation(&swapped_role),
        Some("distinct_auxiliary_stem".to_string())
    );
}
// Ladders with the same value sequence must collapse onto one " -> "-joined key.
#[test]
fn builds_cgo_scalar_ladder_counts() {
    // Small helper so each fixture ladder reads as a plain list of rendered values.
    let ladder = |steps: &[&str]| -> Vec<String> {
        steps.iter().map(|step| step.to_string()).collect()
    };
    let fixtures = [
        ladder(&["10.000000", "20.000000"]),
        ladder(&["10.000000", "20.000000"]),
        ladder(&["55.000000", "85.000000"]),
    ];

    let ladders = build_cgo_scalar_ladder_counts(fixtures.iter());

    assert_eq!(ladders.get("10.000000 -> 20.000000"), Some(&2));
    assert_eq!(ladders.get("55.000000 -> 85.000000"), Some(&1));
}
// Repeated owned strings must be tallied per distinct value.
#[test]
fn counts_owned_value_strings() {
    let counts =
        count_owned_values(["13".to_string(), "13".to_string(), "4".to_string()].into_iter());
    assert_eq!(counts.get("13"), Some(&2));
    assert_eq!(counts.get("4"), Some(&1));
}

View file

@ -78,8 +78,7 @@ pub fn inspect_imb_bytes(bytes: &[u8]) -> Result<ImbInspectionReport, Box<dyn st
let target_screen_width = find_scalar_i64(&entries, "TGATargetScreenWidth");
let target_screen_height = find_scalar_i64(&entries, "TGATargetScreenHeight");
let scaleable = find_scalar_i64(&entries, "Scaleable").map(|value| value != 0);
let max_percent_of_interface_vram =
find_scalar_f64(&entries, "MaxPercentOfInterfaceVRAM");
let max_percent_of_interface_vram = find_scalar_f64(&entries, "MaxPercentOfInterfaceVRAM");
let image_rect = find_i64_quad(&entries, "ImageWH");
Ok(ImbInspectionReport {

View file

@ -12,7 +12,8 @@ This file is the short active queue for the current runtime and reverse-engineer
- The active static parser head is now the `engine_types` semantics frontier.
The repo now has structural inspectors for `.car`, `.lco`, `.cgo`, and `.cct`, but the binary side is still only partially semantic: the checked 1.05 corpus grounds `.car` fixed strings at `0x0c / 0x48 / 0x84` plus a second fixed stem slot at `0xa2` and a side-view resource name at `0xc0`, while `.lco` carries a stable primary stem at `0x04` and only conditional companion/body slots at `0x0c` and `0x12` when the leading stem slot is padded.
The next honest static work is to keep promoting those fixed lanes into stable parser fields and decide how far `.cgo` and the remaining `EngineTypes` sidecars can be grounded without overclaiming semantics.
The checked 1.05 corpus now also splits `.car` auxiliary stems into `126` direct matches, `14` role-neutral roots, and only `5` truly distinct cases, while `.cgo` collapses into five stable scalar ladders instead of arbitrary floats.
The next honest static work is to keep promoting those fixed lanes into stable parser fields, explain the five remaining distinct auxiliary-stem cases, and decide how far the `.cgo` ladders and guarded `.lco` companion lanes can be grounded without overclaiming semantics.
Preserved checked parser detail now lives in [EngineTypes parser semantics](rehost-queue/engine-types-parser-semantics-2026-04-21.md).
Preserved checked format inventory detail now lives in [RT3 format inventory](rehost-queue/format-inventory-2026-04-21.md).

View file

@ -13,6 +13,16 @@ first `.car` / `.lco` / `.cgo` / `.cct` inspector pass landed.
- `0xc0`: side-view resource name such as `CarSideView_1.imb`
- The checked 1.05 corpus (`145` `.car` files) carries all five of those `.car` slots on every
file inspected so far.
- The checked 1.05 corpus now also grounds the `0xa2` relation split:
- `126` files: `auxiliary_stem == internal_stem`
- `14` files: `auxiliary_stem` equals `internal_stem` with its trailing role suffix (`L` / `T`) stripped
- `5` files: truly distinct auxiliary stems
- Those five distinct auxiliary-stem cases are narrow and specific (listed as `internal_stem -> auxiliary_stem`):
- `ClassA1T -> ClassA1L`
- `CramptonT -> CramptonL`
- `WhaleT -> WhaleL`
- `classqjl -> qjclassl`
- `classqjt -> qjclasst`
- `.lco` carries one always-present primary stem at `0x04`.
- `.lco` only carries meaningful secondary slots when that leading stem slot is padded:
- `0x0c`: conditional companion stem such as `VL80T` or `Zephyr`
@ -22,6 +32,13 @@ first `.car` / `.lco` / `.cgo` / `.cct` inspector pass landed.
fixed fields unless the earlier slot is actually zero-padded.
- `.cgo` looks structurally narrow right now: the checked 1.05 corpus has `37` files, all exactly
`25` bytes long, each carrying one leading scalar lane plus an inline content stem at `0x04`.
- The `.cgo` leading scalar is no longer just a loose raw count. The checked 1.05 corpus now
collapses into five stable ladders:
- `10 -> 20 -> 40 -> 80` across `6` freight-car families
- `20 -> 40 -> 80` for `Tanker`
- `55 -> 85` for `Auto_Carrier`
- `6 -> 13 -> 27 -> 53` for `Passenger`
- `7 -> 13 -> 27 -> 53` for `Mail`
- `.cct` remains the least ambiguous sidecar: current shipped files still look like narrow one-row
text metadata.
@ -33,6 +50,7 @@ first `.car` / `.lco` / `.cgo` / `.cct` inspector pass landed.
- internal stem
- auxiliary stem slot
- side-view resource name
- auxiliary-stem relation counts across the shipped corpus
- `.lco`
- full internal stem
- conditional companion stem slot
@ -41,14 +59,15 @@ first `.car` / `.lco` / `.cgo` / `.cct` inspector pass landed.
- `.cgo`
- leading scalar lane
- content stem
- scalar ladder counts by shared cargo-car family
- `.cct`
- tokenized identifier/value row
## Remaining Static Questions
- `.car`
- what the `0xa2` auxiliary stem really represents across locomotive, tender, and freight-car
families: alias root, image key, or alternate content stem
- what the `0xa2` auxiliary stem really represents in the five remaining distinct cases:
alternate content root, paired tender/loco image root, or a narrower foreign-display alias
- whether the trailing side-view resource can be tied cleanly to `.imb` metadata without
inventing frontend semantics
- `.lco`
@ -57,8 +76,8 @@ first `.car` / `.lco` / `.cgo` / `.cct` inspector pass landed.
- how much of the early numeric lane block can be promoted from raw `u32/f32` views into stable
typed semantics without dynamic evidence
- `.cgo`
- whether the leading scalar is enough to justify a named typed field, or whether it should stay
a conservative raw scalar until more binary/code correlation exists
- whether the leading scalar ladders are enough to justify a named typed field, or whether they
should stay conservative report-only ladders until more binary/code correlation exists
## Next Static Parser Work