[gdsii_arrow] further improvements to speed

This commit is contained in:
Jan Petykiewicz 2026-04-02 15:47:41 -07:00
commit 28562f73f6
3 changed files with 235 additions and 61 deletions

View file

@ -5,17 +5,18 @@ import importlib
import json import json
import time import time
from pathlib import Path from pathlib import Path
from typing import Any
from masque import LibraryError from masque import LibraryError
READERS = { READERS: dict[str, tuple[str, tuple[str, ...]]] = {
'gdsii': ('masque.file.gdsii', 'readfile'), 'gdsii': ('masque.file.gdsii', ('readfile',)),
'gdsii_arrow': ('masque.file.gdsii_arrow', 'readfile'), 'gdsii_arrow': ('masque.file.gdsii_arrow', ('readfile', 'arrow_import', 'arrow_convert')),
} }
def _summarize(path: Path, elapsed_s: float, info: dict[str, object], lib: object) -> dict[str, object]: def _summarize_library(path: Path, elapsed_s: float, info: dict[str, object], lib: object) -> dict[str, object]:
assert hasattr(lib, '__len__') assert hasattr(lib, '__len__')
assert hasattr(lib, 'tops') assert hasattr(lib, 'tops')
tops = lib.tops() # type: ignore[no-any-return, attr-defined] tops = lib.tops() # type: ignore[no-any-return, attr-defined]
@ -34,12 +35,50 @@ def _summarize(path: Path, elapsed_s: float, info: dict[str, object], lib: objec
} }
def _summarize_arrow_import(path: Path, elapsed_s: float, arrow_arr: Any) -> dict[str, object]:
libarr = arrow_arr[0]
return {
'path': str(path),
'elapsed_s': elapsed_s,
'arrow_rows': len(arrow_arr),
'library_name': libarr['lib_name'].as_py(),
'cell_count': len(libarr['cells']),
'layer_count': len(libarr['layers']),
}
def _profile_stage(module: Any, stage: str, path: Path) -> dict[str, object]:
start = time.perf_counter()
if stage == 'readfile':
lib, info = module.readfile(path)
elapsed_s = time.perf_counter() - start
return _summarize_library(path, elapsed_s, info, lib)
if stage == 'arrow_import':
arrow_arr = module._read_to_arrow(path)
elapsed_s = time.perf_counter() - start
return _summarize_arrow_import(path, elapsed_s, arrow_arr)
if stage == 'arrow_convert':
arrow_arr = module._read_to_arrow(path)
libarr = arrow_arr[0]
start = time.perf_counter()
lib, info = module.read_arrow(libarr)
elapsed_s = time.perf_counter() - start
return _summarize_library(path, elapsed_s, info, lib)
raise ValueError(f'Unsupported stage {stage!r}')
def build_arg_parser() -> argparse.ArgumentParser: def build_arg_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description='Profile GDS readers with a stable end-to-end workload.') parser = argparse.ArgumentParser(description='Profile GDS readers with a stable end-to-end workload.')
parser.add_argument('--reader', choices=sorted(READERS), required=True) parser.add_argument('--reader', choices=sorted(READERS), required=True)
parser.add_argument('--stage', default='readfile')
parser.add_argument('--path', type=Path, required=True) parser.add_argument('--path', type=Path, required=True)
parser.add_argument('--warmup', type=int, default=1) parser.add_argument('--warmup', type=int, default=1)
parser.add_argument('--repeat', type=int, default=1) parser.add_argument('--repeat', type=int, default=1)
parser.add_argument('--output-json', type=Path)
return parser return parser
@ -47,26 +86,32 @@ def main(argv: list[str] | None = None) -> int:
parser = build_arg_parser() parser = build_arg_parser()
args = parser.parse_args(argv) args = parser.parse_args(argv)
module_name, attr_name = READERS[args.reader] module_name, stages = READERS[args.reader]
readfile = getattr(importlib.import_module(module_name), attr_name) if args.stage not in stages:
parser.error(f'reader {args.reader!r} only supports stages: {", ".join(stages)}')
module = importlib.import_module(module_name)
path = args.path.expanduser().resolve() path = args.path.expanduser().resolve()
for _ in range(args.warmup): for _ in range(args.warmup):
readfile(path) _profile_stage(module, args.stage, path)
runs = [] runs = []
for _ in range(args.repeat): for _ in range(args.repeat):
start = time.perf_counter() runs.append(_profile_stage(module, args.stage, path))
lib, info = readfile(path)
elapsed_s = time.perf_counter() - start
runs.append(_summarize(path, elapsed_s, info, lib))
print(json.dumps({ payload = {
'reader': args.reader, 'reader': args.reader,
'stage': args.stage,
'warmup': args.warmup, 'warmup': args.warmup,
'repeat': args.repeat, 'repeat': args.repeat,
'runs': runs, 'runs': runs,
}, indent=2, sort_keys=True)) }
rendered = json.dumps(payload, indent=2, sort_keys=True)
if args.output_json is not None:
args.output_json.parent.mkdir(parents=True, exist_ok=True)
args.output_json.write_text(rendered + '\n')
print(rendered)
return 0 return 0

View file

@ -57,6 +57,8 @@ ffi.cdef('void read_path(char* path, struct ArrowArray* array, struct ArrowSchem
clib: Any | None = None clib: Any | None = None
ZERO_OFFSET = numpy.zeros(2)
path_cap_map = { path_cap_map = {
0: Path.Cap.Flush, 0: Path.Cap.Flush,
@ -263,6 +265,29 @@ def read_arrow(
) )
return elem return elem
def get_boundary_batches(libarr: pyarrow.Array) -> dict[str, Any]:
    """Flatten the per-cell 'boundary_batches' column into numpy views.

    Keys:
      offsets      -- per-cell offsets into the batch list
      layer_inds   -- layer index of each batch
      vert_arr     -- all vertices, reshaped to (N, 2)
      vert_off     -- per-batch vertex offsets (halved: two floats per vertex)
      poly_off     -- per-batch offsets into the polygon-offset list
      poly_offsets -- per-polygon vertex offsets within each batch
    """
    batches = libarr['cells'].values.field('boundary_batches')
    entries = batches.values
    verts = entries.field('vertices')
    polys = entries.field('vertex_offsets')
    return {
        'offsets': batches.offsets.to_numpy(),
        'layer_inds': entries.field('layer').to_numpy(),
        'vert_arr': verts.values.to_numpy().reshape((-1, 2)),
        'vert_off': verts.offsets.to_numpy() // 2,
        'poly_off': polys.offsets.to_numpy(),
        'poly_offsets': polys.values.to_numpy(),
    }
def get_boundary_props(libarr: pyarrow.Array) -> dict[str, Any]:
    """Flatten the per-cell 'boundary_props' column into numpy views.

    These are the boundaries that carry GDS properties; their vertex data
    is exposed like `get_boundary_batches`, plus the decoded property
    key/value lists (`prop_off`, `prop_key`, `prop_val`).
    """
    boundaries = libarr['cells'].values.field('boundary_props')
    entries = boundaries.values
    verts = entries.field('vertices')
    props = entries.field('properties')
    return {
        'offsets': boundaries.offsets.to_numpy(),
        'layer_inds': entries.field('layer').to_numpy(),
        'vert_arr': verts.values.to_numpy().reshape((-1, 2)),
        'vert_off': verts.offsets.to_numpy() // 2,
        'prop_off': props.offsets.to_numpy(),
        'prop_key': props.values.field('key').to_numpy(),
        'prop_val': props.values.field('value').to_pylist(),
    }
rf = libarr['cells'].values.field('refs') rf = libarr['cells'].values.field('refs')
refs = dict( refs = dict(
offsets = rf.offsets.to_numpy(), offsets = rf.offsets.to_numpy(),
@ -292,10 +317,9 @@ def read_arrow(
) )
elements = dict( elements = dict(
boundaries = get_geom(libarr, 'boundaries'), boundary_batches = get_boundary_batches(libarr),
boundary_props = get_boundary_props(libarr),
paths = get_geom(libarr, 'paths'), paths = get_geom(libarr, 'paths'),
boxes = get_geom(libarr, 'boxes'),
nodes = get_geom(libarr, 'nodes'),
texts = texts, texts = texts,
refs = refs, refs = refs,
) )
@ -320,7 +344,8 @@ def read_arrow(
for cc in range(len(libarr['cells'])): for cc in range(len(libarr['cells'])):
name = cell_names[cell_ids[cc]] name = cell_names[cell_ids[cc]]
pat = Pattern() pat = Pattern()
_boundaries_to_polygons(pat, global_args, elements['boundaries'], cc) _boundary_batches_to_polygons(pat, global_args, elements['boundary_batches'], cc)
_boundary_props_to_polygons(pat, global_args, elements['boundary_props'], cc)
_gpaths_to_mpaths(pat, global_args, elements['paths'], cc) _gpaths_to_mpaths(pat, global_args, elements['paths'], cc)
_grefs_to_mrefs(pat, global_args, elements['refs'], cc) _grefs_to_mrefs(pat, global_args, elements['refs'], cc)
_texts_to_labels(pat, global_args, elements['texts'], cc) _texts_to_labels(pat, global_args, elements['texts'], cc)
@ -366,6 +391,7 @@ def _grefs_to_mrefs(
elem_rep_xy1 = elem['rep_xy1'][elem_slc][:elem_count] elem_rep_xy1 = elem['rep_xy1'][elem_slc][:elem_count]
elem_rep_counts = elem['rep_counts'][elem_slc][:elem_count] elem_rep_counts = elem['rep_counts'][elem_slc][:elem_count]
rep_valid = elem['rep_valid'][elem_slc][:elem_count] rep_valid = elem['rep_valid'][elem_slc][:elem_count]
raw_mode = global_args['raw_mode']
for ee in range(elem_count): for ee in range(elem_count):
@ -380,9 +406,15 @@ def _grefs_to_mrefs(
a_vector = elem_rep_xy0[ee] a_vector = elem_rep_xy0[ee]
b_vector = elem_rep_xy1[ee] b_vector = elem_rep_xy1[ee]
a_count, b_count = elem_rep_counts[ee] a_count, b_count = elem_rep_counts[ee]
if raw_mode:
rep = Grid._from_raw(a_vector=a_vector, b_vector=b_vector, a_count=a_count, b_count=b_count)
else:
rep = Grid(a_vector=a_vector, b_vector=b_vector, a_count=a_count, b_count=b_count) rep = Grid(a_vector=a_vector, b_vector=b_vector, a_count=a_count, b_count=b_count)
annotations = _read_annotations(prop_offs, prop_key, prop_val, ee) annotations = _read_annotations(prop_offs, prop_key, prop_val, ee)
if raw_mode:
ref = Ref._from_raw(offset=offset, mirrored=mirr, rotation=rot, scale=mag, repetition=rep, annotations=annotations)
else:
ref = Ref(offset=offset, mirrored=mirr, rotation=rot, scale=mag, repetition=rep, annotations=annotations) ref = Ref(offset=offset, mirrored=mirr, rotation=rot, scale=mag, repetition=rep, annotations=annotations)
pat.refs[target].append(ref) pat.refs[target].append(ref)
@ -406,6 +438,7 @@ def _texts_to_labels(
elem_xy = xy[elem_slc][:elem_count] elem_xy = xy[elem_slc][:elem_count]
elem_layer_inds = layer_inds[elem_slc][:elem_count] elem_layer_inds = layer_inds[elem_slc][:elem_count]
elem_strings = elem['string'][elem_slc][:elem_count] elem_strings = elem['string'][elem_slc][:elem_count]
raw_mode = global_args['raw_mode']
for ee in range(elem_count): for ee in range(elem_count):
layer = layer_tups[elem_layer_inds[ee]] layer = layer_tups[elem_layer_inds[ee]]
@ -413,6 +446,9 @@ def _texts_to_labels(
string = elem_strings[ee] string = elem_strings[ee]
annotations = _read_annotations(prop_offs, prop_key, prop_val, ee) annotations = _read_annotations(prop_offs, prop_key, prop_val, ee)
if raw_mode:
mlabel = Label._from_raw(string=string, offset=offset, annotations=annotations)
else:
mlabel = Label(string=string, offset=offset, annotations=annotations) mlabel = Label(string=string, offset=offset, annotations=annotations)
pat.labels[layer].append(mlabel) pat.labels[layer].append(mlabel)
@ -439,7 +475,6 @@ def _gpaths_to_mpaths(
elem_path_types = elem['path_type'][elem_slc][:elem_count] elem_path_types = elem['path_type'][elem_slc][:elem_count]
elem_extensions = elem['extensions'][elem_slc][:elem_count] elem_extensions = elem['extensions'][elem_slc][:elem_count]
zeros = numpy.zeros((elem_count, 2))
raw_mode = global_args['raw_mode'] raw_mode = global_args['raw_mode']
for ee in range(elem_count): for ee in range(elem_count):
layer = layer_tups[elem_layer_inds[ee]] layer = layer_tups[elem_layer_inds[ee]]
@ -453,64 +488,77 @@ def _gpaths_to_mpaths(
cap_extensions = None cap_extensions = None
annotations = _read_annotations(prop_offs, prop_key, prop_val, ee) annotations = _read_annotations(prop_offs, prop_key, prop_val, ee)
path = Path(vertices=vertices, offset=zeros[ee], annotations=annotations, raw=raw_mode, path = Path(vertices=vertices, offset=ZERO_OFFSET, annotations=annotations, raw=raw_mode,
width=width, cap=cap,cap_extensions=cap_extensions) width=width, cap=cap,cap_extensions=cap_extensions)
pat.shapes[layer].append(path) pat.shapes[layer].append(path)
def _boundaries_to_polygons( def _boundary_batches_to_polygons(
pat: Pattern, pat: Pattern,
global_args: dict[str, Any], global_args: dict[str, Any],
elem: dict[str, Any], elem: dict[str, Any],
cc: int, cc: int,
) -> None: ) -> None:
elem_off = elem['offsets'] # which elements belong to each cell elem_off = elem['offsets'] # which elements belong to each cell
xy_val = elem['xy_arr'] vert_arr = elem['vert_arr']
vert_off = elem['vert_off']
layer_inds = elem['layer_inds']
layer_tups = global_args['layer_tups']
poly_off = elem['poly_off']
poly_offsets = elem['poly_offsets']
batch_count = elem_off[cc + 1] - elem_off[cc]
if batch_count == 0:
return
elem_slc = slice(elem_off[cc], elem_off[cc] + batch_count + 1) # +1 to capture ending location for last elem
elem_vert_off = vert_off[elem_slc]
elem_poly_off = poly_off[elem_slc]
elem_layer_inds = layer_inds[elem_slc][:batch_count]
raw_mode = global_args['raw_mode']
for bb in range(batch_count):
layer = layer_tups[elem_layer_inds[bb]]
vertices = vert_arr[elem_vert_off[bb]:elem_vert_off[bb + 1]]
vertex_offsets = numpy.asarray(poly_offsets[elem_poly_off[bb]:elem_poly_off[bb + 1]], dtype=numpy.intp)
if vertex_offsets.size == 1:
poly = Polygon(vertices=vertices, offset=ZERO_OFFSET, annotations=None, raw=raw_mode)
pat.shapes[layer].append(poly)
else:
polys = PolyCollection(vertex_lists=vertices, vertex_offsets=vertex_offsets, offset=ZERO_OFFSET, annotations=None, raw=raw_mode)
pat.shapes[layer].append(polys)
def _boundary_props_to_polygons(
pat: Pattern,
global_args: dict[str, Any],
elem: dict[str, Any],
cc: int,
) -> None:
elem_off = elem['offsets']
vert_arr = elem['vert_arr']
vert_off = elem['vert_off']
layer_inds = elem['layer_inds'] layer_inds = elem['layer_inds']
layer_tups = global_args['layer_tups'] layer_tups = global_args['layer_tups']
prop_key = elem['prop_key'] prop_key = elem['prop_key']
prop_val = elem['prop_val'] prop_val = elem['prop_val']
elem_count = elem_off[cc + 1] - elem_off[cc] elem_count = elem_off[cc + 1] - elem_off[cc]
elem_slc = slice(elem_off[cc], elem_off[cc] + elem_count + 1) # +1 to capture ending location for last elem if elem_count == 0:
xy_offs = elem['xy_off'][elem_slc] # which xy coords belong to each element return
xy_counts = xy_offs[1:] - xy_offs[:-1]
prop_offs = elem['prop_off'][elem_slc] # which props belong to each element elem_slc = slice(elem_off[cc], elem_off[cc] + elem_count + 1)
prop_counts = prop_offs[1:] - prop_offs[:-1] elem_vert_off = vert_off[elem_slc]
prop_offs = elem['prop_off'][elem_slc]
elem_layer_inds = layer_inds[elem_slc][:elem_count] elem_layer_inds = layer_inds[elem_slc][:elem_count]
order = numpy.argsort(elem_layer_inds, stable=True)
unilayer_inds, unilayer_first, unilayer_count = numpy.unique(elem_layer_inds, return_index=True, return_counts=True)
zeros = numpy.zeros((elem_count, 2))
raw_mode = global_args['raw_mode'] raw_mode = global_args['raw_mode']
for layer_ind, ff, nn in zip(unilayer_inds, unilayer_first, unilayer_count, strict=True): for ee in range(elem_count):
ee_inds = order[ff:ff + nn]
layer = layer_tups[layer_ind]
propless_mask = prop_counts[ee_inds] == 0
poly_count_on_layer = propless_mask.sum()
if poly_count_on_layer == 1:
propless_mask[:] = 0 # Never make a 1-element collection
elif poly_count_on_layer > 1:
propless_vert_counts = xy_counts[ee_inds[propless_mask]] - 1 # -1 to drop closing point
vertex_lists = numpy.empty((propless_vert_counts.sum(), 2), dtype=numpy.float64)
vertex_offsets = numpy.cumsum(numpy.concatenate([[0], propless_vert_counts]))
for ii, ee in enumerate(ee_inds[propless_mask]):
vo = vertex_offsets[ii]
vertex_lists[vo:vo + propless_vert_counts[ii]] = xy_val[xy_offs[ee]:xy_offs[ee + 1] - 1]
polys = PolyCollection(vertex_lists=vertex_lists, vertex_offsets=vertex_offsets, offset=zeros[ee])
pat.shapes[layer].append(polys)
# Handle single polygons
for ee in ee_inds[~propless_mask]:
layer = layer_tups[elem_layer_inds[ee]] layer = layer_tups[elem_layer_inds[ee]]
vertices = xy_val[xy_offs[ee]:xy_offs[ee + 1] - 1] # -1 to drop closing point vertices = vert_arr[elem_vert_off[ee]:elem_vert_off[ee + 1]]
annotations = _read_annotations(prop_offs, prop_key, prop_val, ee) annotations = _read_annotations(prop_offs, prop_key, prop_val, ee)
poly = Polygon(vertices=vertices, offset=zeros[ee], annotations=annotations, raw=raw_mode) poly = Polygon(vertices=vertices, offset=ZERO_OFFSET, annotations=annotations, raw=raw_mode)
pat.shapes[layer].append(poly) pat.shapes[layer].append(poly)

View file

@ -5,6 +5,7 @@ import pytest
pytest.importorskip('pyarrow') pytest.importorskip('pyarrow')
from .. import Ref, Label
from ..library import Library from ..library import Library
from ..pattern import Pattern from ..pattern import Pattern
from ..repetition import Grid from ..repetition import Grid
@ -119,7 +120,10 @@ def _make_arrow_test_library() -> Library:
leaf = Pattern() leaf = Pattern()
leaf.polygon((1, 0), vertices=[[0, 0], [10, 0], [10, 10], [0, 10]], annotations={'1': ['leaf-poly']}) leaf.polygon((1, 0), vertices=[[0, 0], [10, 0], [10, 10], [0, 10]], annotations={'1': ['leaf-poly']})
leaf.polygon((2, 0), vertices=[[40, 0], [50, 0], [50, 10], [40, 10]])
leaf.polygon((1, 0), vertices=[[20, 0], [30, 0], [30, 10], [20, 10]]) leaf.polygon((1, 0), vertices=[[20, 0], [30, 0], [30, 10], [20, 10]])
leaf.polygon((1, 0), vertices=[[80, 0], [90, 0], [90, 10], [80, 10]])
leaf.polygon((2, 0), vertices=[[60, 0], [70, 0], [70, 10], [60, 10]], annotations={'18': ['leaf-poly-2']})
leaf.label((10, 0), string='LEAF', offset=(3, 4), annotations={'10': ['leaf-label']}) leaf.label((10, 0), string='LEAF', offset=(3, 4), annotations={'10': ['leaf-label']})
lib['leaf'] = leaf lib['leaf'] = leaf
@ -178,3 +182,80 @@ def test_gdsii_arrow_reads_small_perf_fixture(tmp_path: Path) -> None:
assert len(lib) == manifest.cells assert len(lib) == manifest.cells
assert 'TOP' in lib assert 'TOP' in lib
assert sum(len(refs) for refs in lib['TOP'].refs.values()) > 0 assert sum(len(refs) for refs in lib['TOP'].refs.values()) > 0
def test_gdsii_arrow_boundary_batch_schema(tmp_path: Path) -> None:
    """Round-trip the fixture library and check the leaf cell's boundary schema."""
    library = _make_arrow_test_library()
    gds_path = tmp_path / 'arrow_batches.gds'
    gdsii.writefile(library, gds_path, meters_per_unit=1e-9)

    libarr = gdsii_arrow._read_to_arrow(gds_path)[0]
    cells = libarr['cells'].values
    names = libarr['cell_names'].as_py()
    ids = cells.field('id').to_numpy()

    # Packed layer words decode as (layer, datatype) 16-bit halves.
    decoded_layers = []
    for packed in libarr['layers'].values.to_numpy():
        word = int(packed)
        decoded_layers.append(((word >> 16) & 0xFFFF, word & 0xFFFF))

    leaf_ii = next(ii for ii, cid in enumerate(ids) if names[cid] == 'leaf')
    batches = cells.field('boundary_batches')[leaf_ii].as_py()
    props = cells.field('boundary_props')[leaf_ii].as_py()
    assert len(batches) == 2
    assert len(props) == 2

    batch_by_layer = {decoded_layers[entry['layer']]: entry for entry in batches}
    assert batch_by_layer[(1, 0)]['vertex_offsets'] == [0, 4]
    assert len(batch_by_layer[(1, 0)]['vertices']) == 16
    assert batch_by_layer[(2, 0)]['vertex_offsets'] == [0]
    assert len(batch_by_layer[(2, 0)]['vertices']) == 8

    props_by_layer = {decoded_layers[entry['layer']]: entry for entry in props}
    assert sorted(props_by_layer) == [(1, 0), (2, 0)]
    assert props_by_layer[(1, 0)]['properties'][0]['value'] == 'leaf-poly'
    assert props_by_layer[(2, 0)]['properties'][0]['value'] == 'leaf-poly-2'
def test_raw_ref_grid_label_constructors_match_public() -> None:
    """The private `_from_raw` constructors must match their public equivalents."""
    grid_raw = Grid._from_raw(
        a_vector=numpy.array([20, 0]),
        a_count=3,
        b_vector=numpy.array([0, 30]),
        b_count=2,
        )
    grid_pub = Grid(a_vector=(20, 0), a_count=3, b_vector=(0, 30), b_count=2)
    assert grid_raw == grid_pub

    ref_raw = Ref._from_raw(
        offset=numpy.array([100, 200]),
        rotation=numpy.pi / 2,
        mirrored=True,
        scale=1.25,
        repetition=grid_raw,
        annotations={'12': ['child-ref']},
        )
    ref_pub = Ref(
        offset=(100, 200),
        rotation=numpy.pi / 2,
        mirrored=True,
        scale=1.25,
        repetition=grid_pub,
        annotations={'12': ['child-ref']},
        )
    assert ref_raw == ref_pub
    assert numpy.array_equal(ref_raw.as_transforms(), ref_pub.as_transforms())

    label_raw = Label._from_raw(
        'LEAF',
        offset=numpy.array([3, 4]),
        annotations={'10': ['leaf-label']},
        )
    label_pub = Label(
        'LEAF',
        offset=(3, 4),
        annotations={'10': ['leaf-label']},
        )
    assert label_raw == label_pub
    assert numpy.array_equal(label_raw.get_bounds_single(), label_pub.get_bounds_single())