rrt/crates/rrt-runtime/src/inspect/lng.rs

270 lines
8.5 KiB
Rust

use std::collections::{BTreeMap, BTreeSet};
use std::fs;
use std::path::Path;
use serde::{Deserialize, Serialize};
/// One successfully parsed row of a `.lng` file.
///
/// Rows are produced either by the quoted string-id parser (`kind == "string"`)
/// or by the styled credits parser (`kind == "styled"`).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct LngInspectionEntry {
/// 1-based line number of the row within the decoded text.
pub line_number: usize,
/// Row kind tag; the parsers in this file emit `"string"` or `"styled"`.
pub kind: String,
/// Numeric id that prefixes a quoted string row; `None` for styled rows.
pub string_id: Option<u32>,
/// Numeric level that follows the `*` marker of a styled row; `None` for string rows.
pub style_level: Option<u32>,
/// Row text as spelled in the file: the content between the outer quotes for
/// string rows, or the text after the level digits (leading whitespace removed)
/// for styled rows. Escape sequences are left untouched.
pub raw_text: String,
/// `raw_text` with every literal `\n` escape replaced by a real line break.
pub normalized_text: String,
}
/// A non-blank, non-comment line that none of the row parsers accepted.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct LngMalformedLine {
/// 1-based line number of the offending line within the decoded text.
pub line_number: usize,
/// The line exactly as it appeared after decoding (not trimmed).
pub raw_line: String,
/// Human-readable explanation of why the line was rejected.
pub reason: String,
}
/// Summary of a whole `.lng` file as populated by `inspect_lng_bytes`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct LngInspectionReport {
/// Overall classification derived from the entry kinds present:
/// `"quoted-string-table"`, `"styled-credits-lines"`,
/// `"mixed-language-table"` or `"unclassified-language-text"`.
pub format_family: String,
/// Total number of lines in the decoded text.
pub line_count: usize,
/// Number of parsed entries of any kind (length of `entries`).
pub entry_count: usize,
/// Number of quoted string-id rows.
pub string_entry_count: usize,
/// Number of styled (`*<level>`) rows.
pub styled_entry_count: usize,
/// Number of lines whose trimmed form starts with `;`.
pub comment_count: usize,
/// Number of lines that are empty after trimming whitespace.
pub blank_line_count: usize,
/// Number of distinct string ids that occur on more than one row
/// (length of `duplicate_ids`).
pub duplicate_id_count: usize,
/// The string ids that occur on more than one row, in ascending order.
pub duplicate_ids: Vec<u32>,
/// Number of rejected lines (length of `malformed_lines`).
pub malformed_line_count: usize,
/// Largest string id seen, or `None` when no string row was parsed.
pub highest_string_id: Option<u32>,
/// Free-form remarks describing how the file was interpreted.
pub notes: Vec<String>,
/// Every parsed row, in file order.
pub entries: Vec<LngInspectionEntry>,
/// Every rejected line, in file order.
pub malformed_lines: Vec<LngMalformedLine>,
}
/// Loads the `.lng` file at `path` into memory and inspects its contents.
///
/// # Errors
///
/// Returns the I/O error raised while reading the file, or whatever error
/// `inspect_lng_bytes` reports for the loaded bytes.
pub fn inspect_lng_file(path: &Path) -> Result<LngInspectionReport, Box<dyn std::error::Error>> {
    let contents = fs::read(path)?;
    inspect_lng_bytes(contents.as_slice())
}
/// Inspects the raw bytes of a `.lng` file and builds a full report.
///
/// The bytes are decoded with `decode_windows_1252` and then classified line
/// by line:
///
/// * lines that are empty after trimming count as blank;
/// * lines whose trimmed form starts with `;` count as comments;
/// * lines accepted by `parse_string_entry` become `"string"` entries;
/// * lines accepted by `parse_styled_entry` become `"styled"` entries;
/// * every other line is recorded as malformed.
///
/// # Errors
///
/// The current implementation has no failure path and always returns `Ok`;
/// the `Result` is part of the public signature.
pub fn inspect_lng_bytes(bytes: &[u8]) -> Result<LngInspectionReport, Box<dyn std::error::Error>> {
    let text = decode_windows_1252(bytes);

    let mut entries = Vec::new();
    let mut malformed_lines = Vec::new();
    let mut id_occurrences: BTreeMap<u32, usize> = BTreeMap::new();
    let mut line_count = 0usize;
    let mut comment_count = 0usize;
    let mut blank_line_count = 0usize;
    let mut string_entry_count = 0usize;
    let mut styled_entry_count = 0usize;

    for (index, raw_line) in text.lines().enumerate() {
        let line_number = index + 1;
        // The last line number visited equals the total number of lines.
        line_count = line_number;

        let trimmed = raw_line.trim();
        if trimmed.is_empty() {
            blank_line_count += 1;
        } else if trimmed.starts_with(';') {
            comment_count += 1;
        } else if let Some(entry) = parse_string_entry(line_number, raw_line) {
            string_entry_count += 1;
            if let Some(id) = entry.string_id {
                *id_occurrences.entry(id).or_insert(0) += 1;
            }
            entries.push(entry);
        } else if let Some(entry) = parse_styled_entry(line_number, raw_line) {
            styled_entry_count += 1;
            entries.push(entry);
        } else {
            malformed_lines.push(LngMalformedLine {
                line_number,
                raw_line: raw_line.to_string(),
                reason: "line is neither a quoted string-id row nor a styled credits row".to_string(),
            });
        }
    }

    // Every string id was recorded in the ordered map, so its last key is the maximum.
    let highest_string_id = id_occurrences.keys().next_back().copied();
    let duplicate_ids: Vec<u32> = id_occurrences
        .into_iter()
        .filter(|&(_, occurrences)| occurrences > 1)
        .map(|(id, _)| id)
        .collect();

    let kinds_seen: BTreeSet<&str> = entries.iter().map(|entry| entry.kind.as_str()).collect();
    let has_string_rows = kinds_seen.contains("string");
    let has_styled_rows = kinds_seen.contains("styled");
    let format_family = if has_string_rows && has_styled_rows {
        "mixed-language-table"
    } else if has_string_rows {
        "quoted-string-table"
    } else if has_styled_rows {
        "styled-credits-lines"
    } else {
        "unclassified-language-text"
    }
    .to_string();

    let mut notes = vec![
        "Quoted string rows preserve both the raw escape spelling and a normalized text view where `\\n` becomes a line break.".to_string(),
    ];
    if has_styled_rows {
        notes.push(
            "Styled rows use the observed `*<level>` credits format and preserve the style level separately from the rendered text.".to_string(),
        );
    }
    if !duplicate_ids.is_empty() {
        notes.push("Duplicate string ids are preserved explicitly instead of silently overwriting earlier rows.".to_string());
    }

    let entry_count = entries.len();
    let duplicate_id_count = duplicate_ids.len();
    let malformed_line_count = malformed_lines.len();

    Ok(LngInspectionReport {
        format_family,
        line_count,
        entry_count,
        string_entry_count,
        styled_entry_count,
        comment_count,
        blank_line_count,
        duplicate_id_count,
        duplicate_ids,
        malformed_line_count,
        highest_string_id,
        notes,
        entries,
        malformed_lines,
    })
}
/// Tries to read `raw_line` as a quoted string-id row: `<digits> "<text>"`.
///
/// Leading whitespace before the id and between the id and the opening quote
/// is ignored. Returns `None` when the line does not start with digits, when
/// the digits do not fit the id type, or when the remainder is not a quoted
/// payload.
fn parse_string_entry(line_number: usize, raw_line: &str) -> Option<LngInspectionEntry> {
    let candidate = raw_line.trim_start();

    // ASCII digits are one byte each, so the position of the first non-digit
    // is both the digit count and a valid split offset.
    let id_end = candidate
        .find(|ch: char| !ch.is_ascii_digit())
        .unwrap_or(candidate.len());
    if id_end == 0 {
        return None;
    }

    let (id_digits, rest) = candidate.split_at(id_end);
    let string_id: u32 = id_digits.parse().ok()?;
    let raw_text = parse_quoted_payload(rest.trim_start())?;
    let normalized_text = normalize_lng_text(&raw_text);

    Some(LngInspectionEntry {
        line_number,
        kind: "string".to_string(),
        string_id: Some(string_id),
        style_level: None,
        raw_text,
        normalized_text,
    })
}
/// Tries to read `raw_line` as a styled row: `*<digits><text>`.
///
/// Leading whitespace before the `*` marker and between the level digits and
/// the text is ignored; trailing whitespace of the text is kept. Returns
/// `None` when the marker is missing, when no digits follow it, or when the
/// digits do not fit the level type.
fn parse_styled_entry(line_number: usize, raw_line: &str) -> Option<LngInspectionEntry> {
    let after_marker = raw_line.trim_start().strip_prefix('*')?;

    // ASCII digits are one byte each, so the position of the first non-digit
    // is both the digit count and a valid split offset.
    let level_end = after_marker
        .find(|ch: char| !ch.is_ascii_digit())
        .unwrap_or(after_marker.len());
    if level_end == 0 {
        return None;
    }

    let (level_digits, rest) = after_marker.split_at(level_end);
    let style_level: u32 = level_digits.parse().ok()?;
    let raw_text = rest.trim_start().to_string();
    let normalized_text = normalize_lng_text(&raw_text);

    Some(LngInspectionEntry {
        line_number,
        kind: "styled".to_string(),
        string_id: None,
        style_level: Some(style_level),
        raw_text,
        normalized_text,
    })
}
/// Extracts the text between the outermost double quotes of `text`.
///
/// Surrounding whitespace is ignored. Returns `None` unless the trimmed input
/// both starts and ends with a `"` and those are two distinct characters.
/// Quotes inside the payload are kept as-is.
fn parse_quoted_payload(text: &str) -> Option<String> {
    text.trim()
        .strip_prefix('"')
        .and_then(|rest| rest.strip_suffix('"'))
        .map(str::to_owned)
}
/// Produces the display form of a row's text by turning every literal
/// backslash-`n` escape into an actual line break. All other characters,
/// including other backslashes, are copied unchanged.
fn normalize_lng_text(text: &str) -> String {
    const ESCAPED_NEWLINE: &str = "\\n";
    const LINE_BREAK: &str = "\n";
    text.replace(ESCAPED_NEWLINE, LINE_BREAK)
}
/// Decodes `bytes` into a `String`, converting each byte to one character
/// via `decode_windows_1252_byte`. Decoding cannot fail: every byte value
/// maps to some character.
fn decode_windows_1252(bytes: &[u8]) -> String {
    // Each byte yields at least one byte of UTF-8, so this is a lower bound.
    let mut decoded = String::with_capacity(bytes.len());
    for &byte in bytes {
        decoded.push(decode_windows_1252_byte(byte));
    }
    decoded
}
/// Maps a single Windows-1252 byte to its Unicode character.
///
/// Bytes in `0x80..=0x9F` are looked up in a remap table; the five slots with
/// no dedicated mapping (`0x81`, `0x8D`, `0x8F`, `0x90`, `0x9D`) pass through
/// as the code point with the same numeric value. Every other byte also maps
/// to the code point with the same numeric value.
fn decode_windows_1252_byte(byte: u8) -> char {
    /// Unicode characters for bytes `0x80..=0x9F`, indexed by `byte - 0x80`.
    const HIGH_RANGE: [char; 32] = [
        '\u{20AC}', '\u{0081}', '\u{201A}', '\u{0192}', // 0x80..=0x83
        '\u{201E}', '\u{2026}', '\u{2020}', '\u{2021}', // 0x84..=0x87
        '\u{02C6}', '\u{2030}', '\u{0160}', '\u{2039}', // 0x88..=0x8B
        '\u{0152}', '\u{008D}', '\u{017D}', '\u{008F}', // 0x8C..=0x8F
        '\u{0090}', '\u{2018}', '\u{2019}', '\u{201C}', // 0x90..=0x93
        '\u{201D}', '\u{2022}', '\u{2013}', '\u{2014}', // 0x94..=0x97
        '\u{02DC}', '\u{2122}', '\u{0161}', '\u{203A}', // 0x98..=0x9B
        '\u{0153}', '\u{009D}', '\u{017E}', '\u{0178}', // 0x9C..=0x9F
    ];

    match byte {
        0x80..=0x9F => HIGH_RANGE[usize::from(byte - 0x80)],
        _ => char::from(byte),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A comment line plus space- and tab-separated string rows are all recognized,
    /// and the `\n` escape is expanded in the normalized view.
    #[test]
    fn parses_standard_string_rows_and_comments() {
        let input = b"; comment\n 10 \"Cancel\"\n11\t\"Line\\nBreak\"\n";
        let parsed = inspect_lng_bytes(input).expect("lng should parse");

        assert_eq!(parsed.format_family, "quoted-string-table");
        assert_eq!(parsed.comment_count, 1);
        assert_eq!(parsed.string_entry_count, 2);
        assert_eq!(parsed.highest_string_id, Some(11));
        assert_eq!(parsed.entries[1].normalized_text, "Line\nBreak");
    }

    /// `*<level>` rows are parsed as styled entries; a bare text line is malformed.
    #[test]
    fn parses_styled_credit_rows() {
        let input = b"*3Railroad Tycoon 3\n*2Development\nPopTop\n";
        let parsed = inspect_lng_bytes(input).expect("lng should parse");

        assert_eq!(parsed.format_family, "styled-credits-lines");
        assert_eq!(parsed.styled_entry_count, 2);
        assert_eq!(parsed.malformed_line_count, 1);

        let first = &parsed.entries[0];
        assert_eq!(first.style_level, Some(3));
        assert_eq!(first.raw_text, "Railroad Tycoon 3");
    }

    /// An id that appears on two rows is listed once among the duplicates.
    #[test]
    fn reports_duplicate_string_ids() {
        let input = b"1 \"A\"\n1 \"B\"\n";
        let parsed = inspect_lng_bytes(input).expect("lng should parse");

        assert_eq!(parsed.duplicate_id_count, 1);
        assert_eq!(parsed.duplicate_ids, vec![1]);
    }

    /// Byte `0x85` decodes to the horizontal ellipsis character.
    #[test]
    fn decodes_windows_1252_text() {
        let input = b"1 \"Wait\x85\"\n";
        let parsed = inspect_lng_bytes(input).expect("lng should parse");

        assert_eq!(parsed.entries[0].raw_text, "Wait…");
    }
}