[gdsii_arrow] further improvements to speed

Jan Petykiewicz 2026-04-02 15:47:41 -07:00
commit 28562f73f6
3 changed files with 235 additions and 61 deletions

View file

@@ -5,17 +5,18 @@ import importlib
import json
import time
from pathlib import Path
from typing import Any
from masque import LibraryError
READERS = {
'gdsii': ('masque.file.gdsii', 'readfile'),
'gdsii_arrow': ('masque.file.gdsii_arrow', 'readfile'),
READERS: dict[str, tuple[str, tuple[str, ...]]] = {
'gdsii': ('masque.file.gdsii', ('readfile',)),
'gdsii_arrow': ('masque.file.gdsii_arrow', ('readfile', 'arrow_import', 'arrow_convert')),
}
def _summarize(path: Path, elapsed_s: float, info: dict[str, object], lib: object) -> dict[str, object]:
def _summarize_library(path: Path, elapsed_s: float, info: dict[str, object], lib: object) -> dict[str, object]:
assert hasattr(lib, '__len__')
assert hasattr(lib, 'tops')
tops = lib.tops() # type: ignore[no-any-return, attr-defined]
@@ -34,12 +35,50 @@ def _summarize(path: Path, elapsed_s: float, info: dict[str, object], lib: objec
}
def _summarize_arrow_import(path: Path, elapsed_s: float, arrow_arr: Any) -> dict[str, object]:
libarr = arrow_arr[0]
return {
'path': str(path),
'elapsed_s': elapsed_s,
'arrow_rows': len(arrow_arr),
'library_name': libarr['lib_name'].as_py(),
'cell_count': len(libarr['cells']),
'layer_count': len(libarr['layers']),
}
def _profile_stage(module: Any, stage: str, path: Path) -> dict[str, object]:
start = time.perf_counter()
if stage == 'readfile':
lib, info = module.readfile(path)
elapsed_s = time.perf_counter() - start
return _summarize_library(path, elapsed_s, info, lib)
if stage == 'arrow_import':
arrow_arr = module._read_to_arrow(path)
elapsed_s = time.perf_counter() - start
return _summarize_arrow_import(path, elapsed_s, arrow_arr)
if stage == 'arrow_convert':
arrow_arr = module._read_to_arrow(path)
libarr = arrow_arr[0]
start = time.perf_counter()
lib, info = module.read_arrow(libarr)
elapsed_s = time.perf_counter() - start
return _summarize_library(path, elapsed_s, info, lib)
raise ValueError(f'Unsupported stage {stage!r}')
def build_arg_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description='Profile GDS readers with a stable end-to-end workload.')
parser.add_argument('--reader', choices=sorted(READERS), required=True)
parser.add_argument('--stage', default='readfile')
parser.add_argument('--path', type=Path, required=True)
parser.add_argument('--warmup', type=int, default=1)
parser.add_argument('--repeat', type=int, default=1)
parser.add_argument('--output-json', type=Path)
return parser
@@ -47,26 +86,32 @@ def main(argv: list[str] | None = None) -> int:
parser = build_arg_parser()
args = parser.parse_args(argv)
module_name, attr_name = READERS[args.reader]
readfile = getattr(importlib.import_module(module_name), attr_name)
module_name, stages = READERS[args.reader]
if args.stage not in stages:
parser.error(f'reader {args.reader!r} only supports stages: {", ".join(stages)}')
module = importlib.import_module(module_name)
path = args.path.expanduser().resolve()
for _ in range(args.warmup):
readfile(path)
_profile_stage(module, args.stage, path)
runs = []
for _ in range(args.repeat):
start = time.perf_counter()
lib, info = readfile(path)
elapsed_s = time.perf_counter() - start
runs.append(_summarize(path, elapsed_s, info, lib))
runs.append(_profile_stage(module, args.stage, path))
print(json.dumps({
payload = {
'reader': args.reader,
'stage': args.stage,
'warmup': args.warmup,
'repeat': args.repeat,
'runs': runs,
}, indent=2, sort_keys=True))
}
rendered = json.dumps(payload, indent=2, sort_keys=True)
if args.output_json is not None:
args.output_json.parent.mkdir(parents=True, exist_ok=True)
args.output_json.write_text(rendered + '\n')
print(rendered)
return 0
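
With this change each reader entry in READERS also lists the stages it supports, and --stage picks which span the perf_counter timer wraps: the full readfile() path, the C-side _read_to_arrow() import, or the read_arrow() conversion alone. A minimal sketch of driving the script through main() follows; the module name profile_readers and the file paths are placeholders, since the script's filename does not appear in this diff:

    # Hedged sketch: invoke the profiler programmatically via main().
    # `profile_readers` and the .gds / .json paths are hypothetical.
    from profile_readers import main

    main([
        '--reader', 'gdsii_arrow',
        '--stage', 'arrow_convert',   # times read_arrow() alone; _read_to_arrow() runs before the timer starts
        '--path', 'example.gds',
        '--warmup', '1',
        '--repeat', '5',
        '--output-json', 'out/arrow_convert.json',
    ])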

View file

@@ -57,6 +57,8 @@ ffi.cdef('void read_path(char* path, struct ArrowArray* array, struct ArrowSchem
clib: Any | None = None
ZERO_OFFSET = numpy.zeros(2)
path_cap_map = {
0: Path.Cap.Flush,
@@ -263,6 +265,29 @@ def read_arrow(
)
return elem
def get_boundary_batches(libarr: pyarrow.Array) -> dict[str, Any]:
batches = libarr['cells'].values.field('boundary_batches')
return dict(
offsets = batches.offsets.to_numpy(),
layer_inds = batches.values.field('layer').to_numpy(),
vert_arr = batches.values.field('vertices').values.to_numpy().reshape((-1, 2)),
vert_off = batches.values.field('vertices').offsets.to_numpy() // 2,
poly_off = batches.values.field('vertex_offsets').offsets.to_numpy(),
poly_offsets = batches.values.field('vertex_offsets').values.to_numpy(),
)
def get_boundary_props(libarr: pyarrow.Array) -> dict[str, Any]:
boundaries = libarr['cells'].values.field('boundary_props')
return dict(
offsets = boundaries.offsets.to_numpy(),
layer_inds = boundaries.values.field('layer').to_numpy(),
vert_arr = boundaries.values.field('vertices').values.to_numpy().reshape((-1, 2)),
vert_off = boundaries.values.field('vertices').offsets.to_numpy() // 2,
prop_off = boundaries.values.field('properties').offsets.to_numpy(),
prop_key = boundaries.values.field('properties').values.field('key').to_numpy(),
prop_val = boundaries.values.field('properties').values.field('value').to_pylist(),
)
rf = libarr['cells'].values.field('refs')
refs = dict(
offsets = rf.offsets.to_numpy(),
@@ -292,10 +317,9 @@ def read_arrow(
)
elements = dict(
boundaries = get_geom(libarr, 'boundaries'),
boundary_batches = get_boundary_batches(libarr),
boundary_props = get_boundary_props(libarr),
paths = get_geom(libarr, 'paths'),
boxes = get_geom(libarr, 'boxes'),
nodes = get_geom(libarr, 'nodes'),
texts = texts,
refs = refs,
)
@@ -320,7 +344,8 @@ def read_arrow(
for cc in range(len(libarr['cells'])):
name = cell_names[cell_ids[cc]]
pat = Pattern()
_boundaries_to_polygons(pat, global_args, elements['boundaries'], cc)
_boundary_batches_to_polygons(pat, global_args, elements['boundary_batches'], cc)
_boundary_props_to_polygons(pat, global_args, elements['boundary_props'], cc)
_gpaths_to_mpaths(pat, global_args, elements['paths'], cc)
_grefs_to_mrefs(pat, global_args, elements['refs'], cc)
_texts_to_labels(pat, global_args, elements['texts'], cc)
@@ -366,6 +391,7 @@ def _grefs_to_mrefs(
elem_rep_xy1 = elem['rep_xy1'][elem_slc][:elem_count]
elem_rep_counts = elem['rep_counts'][elem_slc][:elem_count]
rep_valid = elem['rep_valid'][elem_slc][:elem_count]
raw_mode = global_args['raw_mode']
for ee in range(elem_count):
@@ -380,9 +406,15 @@ def _grefs_to_mrefs(
a_vector = elem_rep_xy0[ee]
b_vector = elem_rep_xy1[ee]
a_count, b_count = elem_rep_counts[ee]
if raw_mode:
rep = Grid._from_raw(a_vector=a_vector, b_vector=b_vector, a_count=a_count, b_count=b_count)
else:
rep = Grid(a_vector=a_vector, b_vector=b_vector, a_count=a_count, b_count=b_count)
annotations = _read_annotations(prop_offs, prop_key, prop_val, ee)
if raw_mode:
ref = Ref._from_raw(offset=offset, mirrored=mirr, rotation=rot, scale=mag, repetition=rep, annotations=annotations)
else:
ref = Ref(offset=offset, mirrored=mirr, rotation=rot, scale=mag, repetition=rep, annotations=annotations)
pat.refs[target].append(ref)
@@ -406,6 +438,7 @@ def _texts_to_labels(
elem_xy = xy[elem_slc][:elem_count]
elem_layer_inds = layer_inds[elem_slc][:elem_count]
elem_strings = elem['string'][elem_slc][:elem_count]
raw_mode = global_args['raw_mode']
for ee in range(elem_count):
layer = layer_tups[elem_layer_inds[ee]]
@@ -413,6 +446,9 @@
string = elem_strings[ee]
annotations = _read_annotations(prop_offs, prop_key, prop_val, ee)
if raw_mode:
mlabel = Label._from_raw(string=string, offset=offset, annotations=annotations)
else:
mlabel = Label(string=string, offset=offset, annotations=annotations)
pat.labels[layer].append(mlabel)
@@ -439,7 +475,6 @@ def _gpaths_to_mpaths(
elem_path_types = elem['path_type'][elem_slc][:elem_count]
elem_extensions = elem['extensions'][elem_slc][:elem_count]
zeros = numpy.zeros((elem_count, 2))
raw_mode = global_args['raw_mode']
for ee in range(elem_count):
layer = layer_tups[elem_layer_inds[ee]]
@@ -453,64 +488,77 @@
cap_extensions = None
annotations = _read_annotations(prop_offs, prop_key, prop_val, ee)
path = Path(vertices=vertices, offset=zeros[ee], annotations=annotations, raw=raw_mode,
path = Path(vertices=vertices, offset=ZERO_OFFSET, annotations=annotations, raw=raw_mode,
width=width, cap=cap, cap_extensions=cap_extensions)
pat.shapes[layer].append(path)
def _boundaries_to_polygons(
def _boundary_batches_to_polygons(
pat: Pattern,
global_args: dict[str, Any],
elem: dict[str, Any],
cc: int,
) -> None:
elem_off = elem['offsets'] # which elements belong to each cell
xy_val = elem['xy_arr']
vert_arr = elem['vert_arr']
vert_off = elem['vert_off']
layer_inds = elem['layer_inds']
layer_tups = global_args['layer_tups']
poly_off = elem['poly_off']
poly_offsets = elem['poly_offsets']
batch_count = elem_off[cc + 1] - elem_off[cc]
if batch_count == 0:
return
elem_slc = slice(elem_off[cc], elem_off[cc] + batch_count + 1) # +1 to capture ending location for last elem
elem_vert_off = vert_off[elem_slc]
elem_poly_off = poly_off[elem_slc]
elem_layer_inds = layer_inds[elem_slc][:batch_count]
raw_mode = global_args['raw_mode']
for bb in range(batch_count):
layer = layer_tups[elem_layer_inds[bb]]
vertices = vert_arr[elem_vert_off[bb]:elem_vert_off[bb + 1]]
vertex_offsets = numpy.asarray(poly_offsets[elem_poly_off[bb]:elem_poly_off[bb + 1]], dtype=numpy.intp)
if vertex_offsets.size == 1:
poly = Polygon(vertices=vertices, offset=ZERO_OFFSET, annotations=None, raw=raw_mode)
pat.shapes[layer].append(poly)
else:
polys = PolyCollection(vertex_lists=vertices, vertex_offsets=vertex_offsets, offset=ZERO_OFFSET, annotations=None, raw=raw_mode)
pat.shapes[layer].append(polys)
def _boundary_props_to_polygons(
pat: Pattern,
global_args: dict[str, Any],
elem: dict[str, Any],
cc: int,
) -> None:
elem_off = elem['offsets']
vert_arr = elem['vert_arr']
vert_off = elem['vert_off']
layer_inds = elem['layer_inds']
layer_tups = global_args['layer_tups']
prop_key = elem['prop_key']
prop_val = elem['prop_val']
elem_count = elem_off[cc + 1] - elem_off[cc]
elem_slc = slice(elem_off[cc], elem_off[cc] + elem_count + 1) # +1 to capture ending location for last elem
xy_offs = elem['xy_off'][elem_slc] # which xy coords belong to each element
xy_counts = xy_offs[1:] - xy_offs[:-1]
prop_offs = elem['prop_off'][elem_slc] # which props belong to each element
prop_counts = prop_offs[1:] - prop_offs[:-1]
if elem_count == 0:
return
elem_slc = slice(elem_off[cc], elem_off[cc] + elem_count + 1)
elem_vert_off = vert_off[elem_slc]
prop_offs = elem['prop_off'][elem_slc]
elem_layer_inds = layer_inds[elem_slc][:elem_count]
order = numpy.argsort(elem_layer_inds, stable=True)
unilayer_inds, unilayer_first, unilayer_count = numpy.unique(elem_layer_inds, return_index=True, return_counts=True)
zeros = numpy.zeros((elem_count, 2))
raw_mode = global_args['raw_mode']
for layer_ind, ff, nn in zip(unilayer_inds, unilayer_first, unilayer_count, strict=True):
ee_inds = order[ff:ff + nn]
layer = layer_tups[layer_ind]
propless_mask = prop_counts[ee_inds] == 0
poly_count_on_layer = propless_mask.sum()
if poly_count_on_layer == 1:
propless_mask[:] = 0 # Never make a 1-element collection
elif poly_count_on_layer > 1:
propless_vert_counts = xy_counts[ee_inds[propless_mask]] - 1 # -1 to drop closing point
vertex_lists = numpy.empty((propless_vert_counts.sum(), 2), dtype=numpy.float64)
vertex_offsets = numpy.cumsum(numpy.concatenate([[0], propless_vert_counts]))
for ii, ee in enumerate(ee_inds[propless_mask]):
vo = vertex_offsets[ii]
vertex_lists[vo:vo + propless_vert_counts[ii]] = xy_val[xy_offs[ee]:xy_offs[ee + 1] - 1]
polys = PolyCollection(vertex_lists=vertex_lists, vertex_offsets=vertex_offsets, offset=zeros[ee])
pat.shapes[layer].append(polys)
# Handle single polygons
for ee in ee_inds[~propless_mask]:
for ee in range(elem_count):
layer = layer_tups[elem_layer_inds[ee]]
vertices = xy_val[xy_offs[ee]:xy_offs[ee + 1] - 1] # -1 to drop closing point
vertices = vert_arr[elem_vert_off[ee]:elem_vert_off[ee + 1]]
annotations = _read_annotations(prop_offs, prop_key, prop_val, ee)
poly = Polygon(vertices=vertices, offset=zeros[ee], annotations=annotations, raw=raw_mode)
poly = Polygon(vertices=vertices, offset=ZERO_OFFSET, annotations=annotations, raw=raw_mode)
pat.shapes[layer].append(poly)
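
All of the new converters share the same Arrow list-offset pattern: offsets[cc] and offsets[cc + 1] bracket cell cc's entries, the slice extends one element past the count so the closing offset of the last element is kept, and vertex offsets are divided by 2 because Arrow counts interleaved x,y scalars rather than points. A toy sketch of that pattern with made-up buffers (not the real schema):

    import numpy

    # Toy stand-ins for the Arrow buffers: two cells, one boundary each,
    # vertices flattened as interleaved x,y scalars.
    flat_xy = numpy.array([0, 0, 10, 0, 10, 10,  5, 5, 6, 5, 6, 6, 5, 6], dtype=numpy.float64)
    vert_arr = flat_xy.reshape((-1, 2))          # (N, 2) points
    vert_off = numpy.array([0, 6, 14]) // 2      # Arrow offsets count scalars; // 2 converts to points
    elem_off = numpy.array([0, 1, 2])            # boundaries-per-cell list offsets

    cc = 0                                       # cell index
    count = elem_off[cc + 1] - elem_off[cc]
    slc = slice(elem_off[cc], elem_off[cc] + count + 1)   # +1 captures the ending offset for the last elem
    offs = vert_off[slc]
    for ee in range(count):
        print(vert_arr[offs[ee]:offs[ee + 1]])   # vertices for boundary ee of cell cc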

View file

@@ -5,6 +5,7 @@ import pytest
pytest.importorskip('pyarrow')
from .. import Ref, Label
from ..library import Library
from ..pattern import Pattern
from ..repetition import Grid
@@ -119,7 +120,10 @@ def _make_arrow_test_library() -> Library:
leaf = Pattern()
leaf.polygon((1, 0), vertices=[[0, 0], [10, 0], [10, 10], [0, 10]], annotations={'1': ['leaf-poly']})
leaf.polygon((2, 0), vertices=[[40, 0], [50, 0], [50, 10], [40, 10]])
leaf.polygon((1, 0), vertices=[[20, 0], [30, 0], [30, 10], [20, 10]])
leaf.polygon((1, 0), vertices=[[80, 0], [90, 0], [90, 10], [80, 10]])
leaf.polygon((2, 0), vertices=[[60, 0], [70, 0], [70, 10], [60, 10]], annotations={'18': ['leaf-poly-2']})
leaf.label((10, 0), string='LEAF', offset=(3, 4), annotations={'10': ['leaf-label']})
lib['leaf'] = leaf
@@ -178,3 +182,80 @@ def test_gdsii_arrow_reads_small_perf_fixture(tmp_path: Path) -> None:
assert len(lib) == manifest.cells
assert 'TOP' in lib
assert sum(len(refs) for refs in lib['TOP'].refs.values()) > 0
def test_gdsii_arrow_boundary_batch_schema(tmp_path: Path) -> None:
lib = _make_arrow_test_library()
gds_file = tmp_path / 'arrow_batches.gds'
gdsii.writefile(lib, gds_file, meters_per_unit=1e-9)
libarr = gdsii_arrow._read_to_arrow(gds_file)[0]
cells = libarr['cells'].values
cell_ids = cells.field('id').to_numpy()
cell_names = libarr['cell_names'].as_py()
layer_table = [
((int(layer) >> 16) & 0xFFFF, int(layer) & 0xFFFF)
for layer in libarr['layers'].values.to_numpy()
]
leaf_index = next(ii for ii, cell_id in enumerate(cell_ids) if cell_names[cell_id] == 'leaf')
boundary_batches = cells.field('boundary_batches')[leaf_index].as_py()
boundary_props = cells.field('boundary_props')[leaf_index].as_py()
assert len(boundary_batches) == 2
assert len(boundary_props) == 2
batch_by_layer = {tuple(layer_table[entry['layer']]): entry for entry in boundary_batches}
assert batch_by_layer[(1, 0)]['vertex_offsets'] == [0, 4]
assert len(batch_by_layer[(1, 0)]['vertices']) == 16
assert batch_by_layer[(2, 0)]['vertex_offsets'] == [0]
assert len(batch_by_layer[(2, 0)]['vertices']) == 8
props_by_layer = {tuple(layer_table[entry['layer']]): entry for entry in boundary_props}
assert sorted(props_by_layer) == [(1, 0), (2, 0)]
assert props_by_layer[(1, 0)]['properties'][0]['value'] == 'leaf-poly'
assert props_by_layer[(2, 0)]['properties'][0]['value'] == 'leaf-poly-2'
def test_raw_ref_grid_label_constructors_match_public() -> None:
raw_grid = Grid._from_raw(
a_vector=numpy.array([20, 0]),
a_count=3,
b_vector=numpy.array([0, 30]),
b_count=2,
)
public_grid = Grid(a_vector=(20, 0), a_count=3, b_vector=(0, 30), b_count=2)
assert raw_grid == public_grid
raw_ref = Ref._from_raw(
offset=numpy.array([100, 200]),
rotation=numpy.pi / 2,
mirrored=True,
scale=1.25,
repetition=raw_grid,
annotations={'12': ['child-ref']},
)
public_ref = Ref(
offset=(100, 200),
rotation=numpy.pi / 2,
mirrored=True,
scale=1.25,
repetition=public_grid,
annotations={'12': ['child-ref']},
)
assert raw_ref == public_ref
assert numpy.array_equal(raw_ref.as_transforms(), public_ref.as_transforms())
raw_label = Label._from_raw(
'LEAF',
offset=numpy.array([3, 4]),
annotations={'10': ['leaf-label']},
)
public_label = Label(
'LEAF',
offset=(3, 4),
annotations={'10': ['leaf-label']},
)
assert raw_label == public_label
assert numpy.array_equal(raw_label.get_bounds_single(), public_label.get_bounds_single())
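
The layer-table decode in test_gdsii_arrow_boundary_batch_schema assumes libarr['layers'] packs (layer, datatype) into single 32-bit values, layer in the high half and datatype in the low half. A quick round-trip sketch of that assumption; the pack_layer helper is illustrative and not part of the module, whose actual packing code is not shown in this diff:

    import numpy

    def pack_layer(layer: int, dtype: int) -> int:
        # Hypothetical inverse of the unpack expression used in the test above.
        return ((layer & 0xFFFF) << 16) | (dtype & 0xFFFF)

    packed = numpy.array([pack_layer(1, 0), pack_layer(2, 0)], dtype=numpy.uint32)
    unpacked = [((int(vv) >> 16) & 0xFFFF, int(vv) & 0xFFFF) for vv in packed]
    assert unpacked == [(1, 0), (2, 0)]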