Better file format 1

This commit is contained in:
2025-04-25 21:30:11 -07:00
parent f5cb818123
commit b20846c797
14 changed files with 213 additions and 688 deletions

3
analyzer/.vscode/settings.json vendored Normal file
View File

@ -0,0 +1,3 @@
{
"python.analysis.typeCheckingMode": "standard"
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

7
analyzer/format.txt Normal file
View File

@ -0,0 +1,7 @@
[entry point]
[number of libraries]
<Library> <Import1> <Import1Addr> <Import2> <Import2Addr> 00: Import the following things. (Terminate on 00)
[number of bytes in starting state as a uint32_t]
00 [00-ff] <bytes>: Use the following 00-ff bytes literally
[01-ff] <byte>: Repeat the next byte 02 to ff times

View File

@ -1,80 +1,124 @@
import base64
from dataclasses import dataclass
from io import BytesIO
import json
from typing import Generator
import pefile
def main():
subject = pefile.PE("subjects\\main.exe")
@dataclass
class Import:
    """One imported library and the procedures pulled in from it."""
    # Raw DLL name bytes as reported by pefile (e.g. b"KERNEL32.dll").
    library: bytes
    # (procedure name, import address) pairs; the address is rebased against
    # the minimum section address in _create_binary.
    procedures: list[tuple[bytes, int]]
def _single_or_none(x):
items = [i for i in x]
if len(items) == 0:
return None
assert len(items) == 1
@dataclass
class Binary:
    """Flattened view of a PE executable, ready for _encode_binary.

    Note: the redundant `object` base class was dropped — all Python 3
    classes are new-style, and this project targets Python ^3.13.
    """
    # Raw memory image covering the relevant sections, rebased so that
    # offset 0 corresponds to the lowest section VirtualAddress.
    starting_state: bytes
    # Entry-point offset into starting_state.
    entry_point: int
    # Imported libraries with their procedures.
    imports: list[Import]
def _single_or_none[T](ts: Generator[T]) -> T | None:
items = [t for t in ts]
if len(items) == 0:
return None
if len(items) == 1:
return items[0]
raise ValueError(f"expected 1 or 0, got {len(items)}")
def _single[T](ts: Generator[T]) -> T:
items = [t for t in ts]
if len(items) == 1:
return items[0]
def _dump(fname, section):
with open(fname, "wb") as f:
if section is not None:
if isinstance(section, bytes):
f.write(section)
else:
f.write(section.get_data())
raise ValueError(f"expected 1, got {len(items)}")
def _create_binary(subject: pefile.PE) -> Binary:
    """Flatten a parsed PE into a Binary: a rebased section image, the
    entry-point offset, and the import table.

    The diffed source interleaved this function with removed lines from the
    previous JSON-emitting version (old `_dump` calls, the `binary` dict,
    the old import loop); this is the de-interleaved new version.

    Raises:
        ValueError: when none of .text/.data/.rdata is present.
    """
    optional_header = subject.OPTIONAL_HEADER
    assert isinstance(optional_header, pefile.Structure)
    # .text must exist; .data/.rdata are optional. Section names are padded
    # to 8 bytes with NULs in the PE header.
    text_section = _single(i for i in subject.sections if i.Name == b".text\0\0\0")
    data_section = _single_or_none(i for i in subject.sections if i.Name == b".data\0\0\0")
    rdata_section = _single_or_none(i for i in subject.sections if i.Name == b".rdata\0\0")
    relevant_sections = [section for section in (text_section, data_section, rdata_section) if section is not None]
    if len(relevant_sections) == 0:
        raise ValueError("no sections to plot")
    print([(i.VirtualAddress, i) for i in relevant_sections])
    # Rebase everything against the lowest section address; round the top
    # of the window up to a page boundary.
    min_address = min(i.VirtualAddress for i in relevant_sections)
    max_address = max(_round_up_to_page(i.VirtualAddress + i.SizeOfRawData) for i in relevant_sections)
    print(min_address, max_address)
    buffer = bytearray(max_address - min_address)
    for section in relevant_sections:
        data = section.get_data()  # TODO: De-pad the text section from 0xccs
        start = section.VirtualAddress - min_address
        buffer[start:start + len(data)] = data
    starting_state = bytes(buffer)
    entry_point_rva = getattr(optional_header, "AddressOfEntryPoint")
    print(entry_point_rva)
    entry_point = entry_point_rva - min_address
    # Collect imports; import addresses are rebased the same way as the image.
    imports: list[Import] = []
    for entry in getattr(subject, "DIRECTORY_ENTRY_IMPORT"):
        library: bytes = entry.dll
        procedures: list[tuple[bytes, int]] = []
        for imp in entry.imports:
            import_address_rva = imp.address - getattr(optional_header, "ImageBase")
            import_address = import_address_rva - min_address
            procedures.append((imp.name, import_address))
        imports.append(Import(library, procedures))
    return Binary(
        starting_state=starting_state,
        entry_point=entry_point,
        imports=imports,
    )
def _encode_binary(binary: Binary) -> bytes:
out = BytesIO()
def _write_u32(n: int):
out.write(n.to_bytes(4, "little", signed=False))
def _write_u8(n: int):
out.write(n.to_bytes(1, "little", signed=False))
def _write_zt(s: bytes):
out.write(s)
_write_u8(0)
_write_u32(binary.entry_point)
for i in binary.imports:
print(i.library)
_write_zt(i.library)
print(i.procedures)
for (procedure, address) in i.procedures:
_write_zt(procedure)
_write_u32(address)
_write_u8(0)
_write_u8(0)
_write_u32(len(binary.starting_state))
# TODO: No RLE for now
for b in binary.starting_state:
_write_u8(b)
return out.getbuffer()
def main():
    """Parse the subject executable and write its encoded image to disk."""
    pe = pefile.PE("subjects\\main.exe")
    encoded = _encode_binary(_create_binary(pe))
    with open("binaries\\main.dat", "wb") as out_file:
        out_file.write(encoded)
def _round_up_to_page(x: int):
# TODO: Is this the page size on x64? I think it is

4
analyzer/poetry.lock generated
View File

@ -13,5 +13,5 @@ files = [
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
content-hash = "6f28e4dc3bf3b09c57354693b75dc7975b70a7aac2ee7c83fc81b0058520d7f9"
python-versions = "^3.13"
content-hash = "8e680dad2071f9d7a37ca34d4fd6da67ba3922e0de45f0442c7b38d07f8fa9f0"

View File

@ -6,7 +6,7 @@ authors = ["Nyeogmi <economicsbat@gmail.com>"]
readme = "README.md"
[tool.poetry.dependencies]
python = "^3.10"
python = "^3.13"
pefile = "^2024.8.26"