#!/usr/bin/env python3
|
|
from __future__ import annotations
|
|
|
|
import csv
|
|
import hashlib
|
|
import json
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
|
|
# Header field names that parse_summary() keeps from the PE header dump;
# every other "<key> <value>" line in that dump is ignored.
SUMMARY_KEYS = {
    # Identification
    "Time/Date",
    "Magic",
    "Subsystem",
    # Addresses
    "AddressOfEntryPoint",
    "ImageBase",
    "BaseOfCode",
    "BaseOfData",
    # Sizes
    "SizeOfImage",
    "SizeOfCode",
    "SizeOfInitializedData",
}
|
|
|
|
# Lower-case substrings used by interesting_strings(): a line of `strings`
# output is kept when its lower-cased form contains any of these.
KEYWORDS = (
    "rail", "cargo", "map", "save", "scenario", "debug",
    "bink", "mss", "direct", "sound", "video", "d3d",
)
|
|
|
|
# Evidence table consumed by build_subsystem_inventory().
#
# Each key is a subsystem label.  "dlls" is the set of imported DLL names and
# "strings" the tuple of string markers that count as evidence for that
# subsystem.  Insertion order is the order in which subsystems are reported.
SUBSYSTEM_HINTS = {
    "startup": {
        "dlls": {"KERNEL32.dll", "ADVAPI32.dll"},
        "strings": ("debug", "OutputDebugStringA"),
    },
    "ui": {
        "dlls": {"USER32.dll", "comdlg32.dll", "GDI32.dll"},
        "strings": ("MessageBoxA", "CreateWindowExA"),
    },
    "render": {
        "dlls": {"d3d8.dll"},
        "strings": ("Direct3D", "Hardware T & L", "Video.win"),
    },
    "audio": {
        "dlls": {"mss32.dll", "DSOUND.dll"},
        "strings": ("DirectSound", "PaintSound.win", "Data\\Sound"),
    },
    "input": {
        "dlls": {"DINPUT8.dll"},
        "strings": ("DirectInput8Create",),
    },
    "network": {
        "dlls": {"WS2_32.dll", "WSOCK32.dll"},
        "strings": ("Direct Play", "HOST version"),
    },
    "filesystem": {
        "dlls": {"KERNEL32.dll"},
        "strings": ("Saved Games", ".\\Maps\\", "MapViewOfFile"),
    },
    "resource": {
        "dlls": {"VERSION.dll", "ole32.dll"},
        "strings": ("CargoIcons", "CargoModels", ".imb"),
    },
    # The remaining subsystems have no DLL evidence; they rely on strings only.
    "map": {
        "dlls": set(),
        "strings": ("gptGameMap", "maps\\*.gmp", "Map Description"),
    },
    "scenario": {
        "dlls": set(),
        "strings": ("Scenario Text File", "Campaign Scenario"),
    },
    "save": {
        "dlls": set(),
        "strings": ("Quicksave", "Save Game:", "Saved Games"),
    },
}
|
|
|
|
|
|
def run_command(*args: str) -> str:
    """Run an external program and return what it wrote to standard output.

    Args:
        *args: Program name followed by its arguments.  They are handed to
            ``subprocess.run`` as an argument vector, so no shell is involved.

    Returns:
        The captured standard output, decoded as text.

    Raises:
        subprocess.CalledProcessError: If the program exits with a non-zero
            status (``check=True``).
        OSError: If the program cannot be started at all, e.g. it is not
            installed (``FileNotFoundError``).
    """
    completed = subprocess.run(
        args,
        check=True,
        text=True,
        # The tools echo names taken from the inspected binary, which are not
        # guaranteed to be valid in the locale encoding.  Substitute undecodable
        # bytes instead of failing the whole run with UnicodeDecodeError.
        errors="replace",
        capture_output=True,
    )
    return completed.stdout
|
|
|
|
|
|
def sha256(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is opened in binary mode and consumed in 1 MiB reads, so it is
    never held in memory as a whole.
    """
    read_size = 1024 * 1024
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            piece = stream.read(read_size)
            if not piece:
                # An empty read marks end of file.
                break
            hasher.update(piece)
    return hasher.hexdigest()
|
|
|
|
|
|
def parse_summary(text: str) -> dict[str, str]:
    """Extract the whitelisted header fields from a PE header dump.

    Every line is stripped and matched against ``<key> <value>``, where the
    key consists of ASCII letters and ``/`` only.  A field is kept when its
    key is a member of ``SUMMARY_KEYS``; if a key appears several times, the
    last occurrence wins.

    Args:
        text: The dump to scan (``main`` passes ``llvm-objdump -p`` output).

    Returns:
        Mapping of field name to its value with surrounding whitespace
        removed.  Lines that do not match, and keys outside
        ``SUMMARY_KEYS``, are ignored.
    """
    field_pattern = re.compile(r"([A-Za-z/]+)\s+(.+)")
    fields: dict[str, str] = {}
    for raw in text.splitlines():
        # Blank lines cannot match the pattern, so no separate check is needed.
        parsed = field_pattern.match(raw.strip())
        if parsed is None:
            continue
        name, payload = parsed.groups()
        if name in SUMMARY_KEYS:
            fields[name] = payload.strip()
    return fields
|
|
|
|
|
|
def parse_sections(text: str) -> list[dict[str, str]]:
    """Parse the section table printed by ``objdump -h``.

    A section row is a line starting with a decimal index, a name, and four
    hexadecimal columns (size, VMA, LMA, file offset).  The line directly
    after a row is recorded verbatim (stripped) as that section's flags.

    Args:
        text: Output of ``objdump -h``.

    Returns:
        One dict per section row, in input order, with the keys ``idx``,
        ``name``, ``size_hex``, ``vma``, ``lma``, ``file_offset`` and
        ``flags``.  Hex columns are lower-cased and prefixed with ``0x``.
    """
    row_pattern = re.compile(
        r"\s*(\d+)\s+(\S+)\s+([0-9a-fA-F]+)\s+([0-9a-fA-F]+)\s+([0-9a-fA-F]+)\s+([0-9a-fA-F]+)"
    )
    rows = text.splitlines()
    # Pair every line with its successor; the last line is paired with "".
    successors = rows[1:] + [""]

    table: list[dict[str, str]] = []
    for row, successor in zip(rows, successors):
        hit = row_pattern.match(row)
        if hit is None:
            continue
        number, label, size, vma, lma, offset = hit.groups()
        record = {
            "idx": number,
            "name": label,
            "size_hex": "0x" + size.lower(),
            "vma": "0x" + vma.lower(),
            "lma": "0x" + lma.lower(),
            "file_offset": "0x" + offset.lower(),
            "flags": successor.strip(),
        }
        table.append(record)
    return table
|
|
|
|
|
|
def parse_imports(text: str) -> tuple[list[str], list[dict[str, str]]]:
    """Parse the import tables out of a PE header dump.

    The parser is a small state machine:

    * a ``DLL Name: <name>`` line starts a new DLL,
    * a following header line containing ``Hint/Ord`` and ``Name`` opens that
      DLL's entry list,
    * each entry line is ``<number> <name>``, or just ``<number>`` for an
      import that has no name (import by ordinal),
    * a blank line closes the entry list.

    Args:
        text: The dump to scan (``main`` passes ``llvm-objdump -p`` output).

    Returns:
        A ``(dlls, functions)`` pair.  ``dlls`` lists DLL names in order of
        appearance.  ``functions`` holds one dict per entry with the keys
        ``dll``, ``hint`` (the leading number, as text) and ``name``; ``name``
        is the empty string for entries that carry only a number.
    """
    dll_pattern = re.compile(r"\s*DLL Name:\s+(.+)")
    # The name part is optional so that number-only (ordinal) rows are kept
    # instead of being silently dropped.
    entry_pattern = re.compile(r"\s*([0-9]+)(?:\s+(.+))?$")

    dlls: list[str] = []
    functions: list[dict[str, str]] = []
    current_dll = ""
    in_imports = False

    for raw_line in text.splitlines():
        line = raw_line.rstrip()

        dll_match = dll_pattern.match(line)
        if dll_match:
            current_dll = dll_match.group(1).strip()
            dlls.append(current_dll)
            # Entries only start after this DLL's own column header.
            in_imports = False
            continue

        if "Hint/Ord" in line and "Name" in line:
            in_imports = True
            continue

        if not in_imports or not current_dll:
            continue

        if not line.strip():
            in_imports = False
            continue

        entry_match = entry_pattern.match(line)
        if entry_match:
            hint, name = entry_match.groups()
            functions.append(
                {"dll": current_dll, "hint": hint, "name": (name or "").strip()}
            )

    return dlls, functions
|
|
|
|
|
|
def interesting_strings(text: str) -> list[str]:
    """Select the lines of *text* that mention any entry of ``KEYWORDS``.

    Matching is case-insensitive on the raw line.  Kept lines are stripped of
    surrounding whitespace and de-duplicated, preserving first-seen order.

    Args:
        text: Newline-separated candidate strings (``main`` passes the output
            of ``strings -n 4``).

    Returns:
        The unique, stripped matching lines in order of first appearance.
    """
    # A dict is used as an insertion-ordered set.
    picked: dict[str, None] = {}
    for candidate in text.splitlines():
        folded = candidate.lower()
        if not any(word in folded for word in KEYWORDS):
            continue
        trimmed = candidate.strip()
        if trimmed:
            picked.setdefault(trimmed, None)
    return list(picked)
|
|
|
|
|
|
def build_subsystem_inventory(dlls: list[str], strings_found: list[str]) -> str:
    """Render a Markdown inventory of subsystems suggested by the evidence.

    For every entry of ``SUBSYSTEM_HINTS`` (in table order) a section is
    emitted when at least one hinted DLL is imported or at least one string
    contains one of the hinted markers.  A closing ``unknown`` section is
    always emitted; it counts the strings containing ``debug`` or ``map``.

    Args:
        dlls: Imported DLL names as they appear in the import table.
        strings_found: Candidate strings to search for markers.

    Returns:
        The Markdown document as a single string ending in a newline.
    """
    # Lower-case every string once; the pairs keep the original spelling for
    # display next to the folded form used for matching.
    folded_strings = [(entry, entry.lower()) for entry in strings_found]

    lines = ["# Starter Subsystem Inventory", ""]
    for name, hints in SUBSYSTEM_HINTS.items():
        # Import tables may spell a DLL name in any letter case, so compare
        # case-insensitively and report the spelling the binary actually uses.
        wanted_dlls = {dll.lower() for dll in hints["dlls"]}
        matched_dlls = sorted({dll for dll in dlls if dll.lower() in wanted_dlls})

        markers = [marker.lower() for marker in hints["strings"]]
        matched_strings = [
            entry
            for entry, folded in folded_strings
            if any(marker in folded for marker in markers)
        ]

        if not matched_dlls and not matched_strings:
            continue

        evidence = []
        if matched_dlls:
            evidence.append("DLLs: " + ", ".join(matched_dlls))
        if matched_strings:
            # Only the first four strings are shown to keep the line short.
            evidence.append("strings: " + "; ".join(matched_strings[:4]))

        lines.append(f"## {name}")
        lines.append("")
        lines.append("- Evidence: " + " | ".join(evidence))
        lines.append("- Status: initial hypothesis only")
        lines.append("")

    unknown_count = sum(
        1 for _, folded in folded_strings if "debug" in folded or "map" in folded
    )
    lines.append("## unknown")
    lines.append("")
    lines.append(f"- Evidence: {unknown_count} broad strings still need manual triage")
    lines.append("- Status: expected until GUI analysis identifies real call sites")
    lines.append("")
    return "\n".join(lines)
|
|
|
|
|
|
def write_csv(path: Path, fieldnames: list[str], rows: list[dict[str, str]]) -> None:
    """Write *rows* to *path* as UTF-8 CSV with a header line.

    Args:
        path: Destination file; it is created or truncated.
        fieldnames: Column names, in output order.
        rows: One dict per record, keyed by column name.
    """
    # newline="" leaves line-ending handling to the csv module.
    with path.open("w", newline="", encoding="utf-8") as sink:
        table = csv.DictWriter(sink, fieldnames=fieldnames)
        table.writeheader()
        for record in rows:
            table.writerow(record)
|
|
|
|
|
|
def _entry_virtual_address(summary: dict[str, str]) -> str:
    """Return ImageBase + AddressOfEntryPoint as a hex string, or "" if unknown."""
    entry_rva = summary.get("AddressOfEntryPoint", "")
    image_base = summary.get("ImageBase", "")
    if not entry_rva or not image_base:
        return ""
    try:
        # Parse only the first token so that anything printed after the number
        # on the same line cannot break the conversion.
        return hex(int(entry_rva.split()[0], 16) + int(image_base.split()[0], 16))
    except (ValueError, IndexError):
        return ""


def main() -> int:
    """Collect starter analysis artifacts for one PE executable.

    Command line: ``collect_pe_artifacts.py <exe-path> <output-dir>``.

    The executable is inspected with the external tools ``file``,
    ``objdump -h``, ``llvm-objdump -p`` and ``strings -n 4``.  The following
    files are written into the output directory (created if missing):
    ``binary-summary.json``, ``sections.csv``, ``imported-dlls.txt``,
    ``imported-functions.csv``, ``interesting-strings.txt``,
    ``subsystem-inventory.md`` and ``function-map.csv`` (seeded with a single
    row for the entry point).

    Returns:
        ``0`` on success, ``2`` when the argument count is wrong, and ``1``
        when the input file is missing or an external tool is unavailable or
        fails.
    """
    if len(sys.argv) != 3:
        print("usage: collect_pe_artifacts.py <exe-path> <output-dir>", file=sys.stderr)
        return 2

    exe_path = Path(sys.argv[1]).resolve()
    out_dir = Path(sys.argv[2]).resolve()

    # Fail early with a clear message instead of a tool traceback later on.
    if not exe_path.is_file():
        print(f"error: not a file: {exe_path}", file=sys.stderr)
        return 1

    out_dir.mkdir(parents=True, exist_ok=True)

    try:
        file_output = run_command("file", str(exe_path)).strip()
        section_output = run_command("objdump", "-h", str(exe_path))
        pe_output = run_command("llvm-objdump", "-p", str(exe_path))
        strings_output = run_command("strings", "-n", "4", str(exe_path))
    except FileNotFoundError as error:
        print(f"error: required tool is not available: {error}", file=sys.stderr)
        return 1
    except subprocess.CalledProcessError as error:
        command = " ".join(str(part) for part in error.cmd)
        print(
            f"error: command failed with status {error.returncode}: {command}",
            file=sys.stderr,
        )
        if error.stderr:
            print(error.stderr.strip(), file=sys.stderr)
        return 1

    summary = parse_summary(pe_output)
    sections = parse_sections(section_output)
    dlls, functions = parse_imports(pe_output)
    strings_found = interesting_strings(strings_output)

    digest = sha256(exe_path)
    summary_payload = {
        "path": str(exe_path),
        "sha256": digest,
        "size_bytes": exe_path.stat().st_size,
        "file": file_output,
        "summary": summary,
        "imported_dll_count": len(dlls),
        "imported_function_count": len(functions),
    }

    (out_dir / "binary-summary.json").write_text(
        json.dumps(summary_payload, indent=2, sort_keys=True) + "\n",
        encoding="utf-8",
    )
    write_csv(
        out_dir / "sections.csv",
        ["idx", "name", "size_hex", "vma", "lma", "file_offset", "flags"],
        sections,
    )
    (out_dir / "imported-dlls.txt").write_text("\n".join(dlls) + "\n", encoding="utf-8")
    write_csv(out_dir / "imported-functions.csv", ["dll", "hint", "name"], functions)
    (out_dir / "interesting-strings.txt").write_text(
        "\n".join(strings_found) + "\n",
        encoding="utf-8",
    )
    # NOTE(review): only the keyword-filtered strings are passed on, so hint
    # markers that contain none of the keywords (e.g. "MessageBoxA") can match
    # only if the same line also contains a keyword -- confirm this is intended.
    (out_dir / "subsystem-inventory.md").write_text(
        build_subsystem_inventory(dlls, strings_found),
        encoding="utf-8",
    )

    write_csv(
        out_dir / "function-map.csv",
        [
            "address",
            "size",
            "name",
            "subsystem",
            "calling_convention",
            "prototype_status",
            "source_tool",
            "confidence",
            "notes",
            "verified_against",
        ],
        [
            {
                "address": _entry_virtual_address(summary),
                "size": "",
                "name": "entrypoint_1_06",
                "subsystem": "startup",
                "calling_convention": "unknown",
                "prototype_status": "unknown",
                "source_tool": "llvm-objdump",
                "confidence": "2",
                "notes": "Seed row from PE header entrypoint; function boundary still needs GUI confirmation.",
                "verified_against": f"sha256:{digest}",
            }
        ],
    )

    return 0
|
|
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|