[gdsii_arrow] further improvements to speed

Jan Petykiewicz 2026-04-02 15:47:41 -07:00
commit 28562f73f6
3 changed files with 235 additions and 61 deletions

View file

@@ -5,17 +5,18 @@ import importlib
import json
import time
from pathlib import Path
from typing import Any
from masque import LibraryError
READERS = {
'gdsii': ('masque.file.gdsii', 'readfile'),
'gdsii_arrow': ('masque.file.gdsii_arrow', 'readfile'),
READERS: dict[str, tuple[str, tuple[str, ...]]] = {
'gdsii': ('masque.file.gdsii', ('readfile',)),
'gdsii_arrow': ('masque.file.gdsii_arrow', ('readfile', 'arrow_import', 'arrow_convert')),
}
def _summarize(path: Path, elapsed_s: float, info: dict[str, object], lib: object) -> dict[str, object]:
def _summarize_library(path: Path, elapsed_s: float, info: dict[str, object], lib: object) -> dict[str, object]:
assert hasattr(lib, '__len__')
assert hasattr(lib, 'tops')
tops = lib.tops() # type: ignore[no-any-return, attr-defined]
@@ -34,12 +35,50 @@ def _summarize(path: Path, elapsed_s: float, info: dict[str, object], lib: objec
}
def _summarize_arrow_import(path: Path, elapsed_s: float, arrow_arr: Any) -> dict[str, object]:
libarr = arrow_arr[0]
return {
'path': str(path),
'elapsed_s': elapsed_s,
'arrow_rows': len(arrow_arr),
'library_name': libarr['lib_name'].as_py(),
'cell_count': len(libarr['cells']),
'layer_count': len(libarr['layers']),
}
def _profile_stage(module: Any, stage: str, path: Path) -> dict[str, object]:
start = time.perf_counter()
if stage == 'readfile':
lib, info = module.readfile(path)
elapsed_s = time.perf_counter() - start
return _summarize_library(path, elapsed_s, info, lib)
if stage == 'arrow_import':
arrow_arr = module._read_to_arrow(path)
elapsed_s = time.perf_counter() - start
return _summarize_arrow_import(path, elapsed_s, arrow_arr)
if stage == 'arrow_convert':
arrow_arr = module._read_to_arrow(path)
libarr = arrow_arr[0]
start = time.perf_counter()
lib, info = module.read_arrow(libarr)
elapsed_s = time.perf_counter() - start
return _summarize_library(path, elapsed_s, info, lib)
raise ValueError(f'Unsupported stage {stage!r}')
def build_arg_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description='Profile GDS readers with a stable end-to-end workload.')
parser.add_argument('--reader', choices=sorted(READERS), required=True)
parser.add_argument('--stage', default='readfile')
parser.add_argument('--path', type=Path, required=True)
parser.add_argument('--warmup', type=int, default=1)
parser.add_argument('--repeat', type=int, default=1)
parser.add_argument('--output-json', type=Path)
return parser
@@ -47,26 +86,32 @@ def main(argv: list[str] | None = None) -> int:
parser = build_arg_parser()
args = parser.parse_args(argv)
module_name, attr_name = READERS[args.reader]
readfile = getattr(importlib.import_module(module_name), attr_name)
module_name, stages = READERS[args.reader]
if args.stage not in stages:
parser.error(f'reader {args.reader!r} only supports stages: {", ".join(stages)}')
module = importlib.import_module(module_name)
path = args.path.expanduser().resolve()
for _ in range(args.warmup):
readfile(path)
_profile_stage(module, args.stage, path)
runs = []
for _ in range(args.repeat):
start = time.perf_counter()
lib, info = readfile(path)
elapsed_s = time.perf_counter() - start
runs.append(_summarize(path, elapsed_s, info, lib))
runs.append(_profile_stage(module, args.stage, path))
print(json.dumps({
payload = {
'reader': args.reader,
'stage': args.stage,
'warmup': args.warmup,
'repeat': args.repeat,
'runs': runs,
}, indent=2, sort_keys=True))
}
rendered = json.dumps(payload, indent=2, sort_keys=True)
if args.output_json is not None:
args.output_json.parent.mkdir(parents=True, exist_ok=True)
args.output_json.write_text(rendered + '\n')
print(rendered)
return 0
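
With this change each reader entry in READERS also lists the stages it supports, and --stage picks which span the perf_counter timer wraps: the full readfile() path, the C-side _read_to_arrow() import, or the read_arrow() conversion alone. A minimal sketch of driving the script through main() follows; the module name profile_readers and the file paths are placeholders, since the script's filename does not appear in this diff:

    # Hedged sketch: invoke the profiler programmatically via main().
    # `profile_readers` and the .gds / .json paths are hypothetical.
    from profile_readers import main

    main([
        '--reader', 'gdsii_arrow',
        '--stage', 'arrow_convert',   # times read_arrow() alone; _read_to_arrow() runs before the timer starts
        '--path', 'example.gds',
        '--warmup', '1',
        '--repeat', '5',
        '--output-json', 'out/arrow_convert.json',
    ])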

View file

@@ -57,6 +57,8 @@ ffi.cdef('void read_path(char* path, struct ArrowArray* array, struct ArrowSchem
clib: Any | None = None
ZERO_OFFSET = numpy.zeros(2)
path_cap_map = {
0: Path.Cap.Flush,
@@ -263,6 +265,29 @@ def read_arrow(
)
return elem
def get_boundary_batches(libarr: pyarrow.Array) -> dict[str, Any]:
batches = libarr['cells'].values.field('boundary_batches')
return dict(
offsets = batches.offsets.to_numpy(),
layer_inds = batches.values.field('layer').to_numpy(),
vert_arr = batches.values.field('vertices').values.to_numpy().reshape((-1, 2)),
vert_off = batches.values.field('vertices').offsets.to_numpy() // 2,
poly_off = batches.values.field('vertex_offsets').offsets.to_numpy(),
poly_offsets = batches.values.field('vertex_offsets').values.to_numpy(),
)
def get_boundary_props(libarr: pyarrow.Array) -> dict[str, Any]:
boundaries = libarr['cells'].values.field('boundary_props')
return dict(
offsets = boundaries.offsets.to_numpy(),
layer_inds = boundaries.values.field('layer').to_numpy(),
vert_arr = boundaries.values.field('vertices').values.to_numpy().reshape((-1, 2)),
vert_off = boundaries.values.field('vertices').offsets.to_numpy() // 2,
prop_off = boundaries.values.field('properties').offsets.to_numpy(),
prop_key = boundaries.values.field('properties').values.field('key').to_numpy(),
prop_val = boundaries.values.field('properties').values.field('value').to_pylist(),
)
rf = libarr['cells'].values.field('refs')
refs = dict(
offsets = rf.offsets.to_numpy(),
@@ -292,10 +317,9 @@ def read_arrow(
)
elements = dict(
boundaries = get_geom(libarr, 'boundaries'),
boundary_batches = get_boundary_batches(libarr),
boundary_props = get_boundary_props(libarr),
paths = get_geom(libarr, 'paths'),
boxes = get_geom(libarr, 'boxes'),
nodes = get_geom(libarr, 'nodes'),
texts = texts,
refs = refs,
)
@@ -320,7 +344,8 @@ def read_arrow(
for cc in range(len(libarr['cells'])):
name = cell_names[cell_ids[cc]]
pat = Pattern()
_boundaries_to_polygons(pat, global_args, elements['boundaries'], cc)
_boundary_batches_to_polygons(pat, global_args, elements['boundary_batches'], cc)
_boundary_props_to_polygons(pat, global_args, elements['boundary_props'], cc)
_gpaths_to_mpaths(pat, global_args, elements['paths'], cc)
_grefs_to_mrefs(pat, global_args, elements['refs'], cc)
_texts_to_labels(pat, global_args, elements['texts'], cc)
@@ -366,6 +391,7 @@ def _grefs_to_mrefs(
elem_rep_xy1 = elem['rep_xy1'][elem_slc][:elem_count]
elem_rep_counts = elem['rep_counts'][elem_slc][:elem_count]
rep_valid = elem['rep_valid'][elem_slc][:elem_count]
raw_mode = global_args['raw_mode']
for ee in range(elem_count):
@@ -380,9 +406,15 @@ def _grefs_to_mrefs(
a_vector = elem_rep_xy0[ee]
b_vector = elem_rep_xy1[ee]
a_count, b_count = elem_rep_counts[ee]
if raw_mode:
rep = Grid._from_raw(a_vector=a_vector, b_vector=b_vector, a_count=a_count, b_count=b_count)
else:
rep = Grid(a_vector=a_vector, b_vector=b_vector, a_count=a_count, b_count=b_count)
annotations = _read_annotations(prop_offs, prop_key, prop_val, ee)
if raw_mode:
ref = Ref._from_raw(offset=offset, mirrored=mirr, rotation=rot, scale=mag, repetition=rep, annotations=annotations)
else:
ref = Ref(offset=offset, mirrored=mirr, rotation=rot, scale=mag, repetition=rep, annotations=annotations)
pat.refs[target].append(ref)
@@ -406,6 +438,7 @@ def _texts_to_labels(
elem_xy = xy[elem_slc][:elem_count]
elem_layer_inds = layer_inds[elem_slc][:elem_count]
elem_strings = elem['string'][elem_slc][:elem_count]
raw_mode = global_args['raw_mode']
for ee in range(elem_count):
layer = layer_tups[elem_layer_inds[ee]]
@@ -413,6 +446,9 @@
string = elem_strings[ee]
annotations = _read_annotations(prop_offs, prop_key, prop_val, ee)
if raw_mode:
mlabel = Label._from_raw(string=string, offset=offset, annotations=annotations)
else:
mlabel = Label(string=string, offset=offset, annotations=annotations)
pat.labels[layer].append(mlabel)
@@ -439,7 +475,6 @@ def _gpaths_to_mpaths(
elem_path_types = elem['path_type'][elem_slc][:elem_count]
elem_extensions = elem['extensions'][elem_slc][:elem_count]
zeros = numpy.zeros((elem_count, 2))
raw_mode = global_args['raw_mode']
for ee in range(elem_count):
layer = layer_tups[elem_layer_inds[ee]]
@@ -453,64 +488,77 @@
cap_extensions = None
annotations = _read_annotations(prop_offs, prop_key, prop_val, ee)
path = Path(vertices=vertices, offset=zeros[ee], annotations=annotations, raw=raw_mode,
path = Path(vertices=vertices, offset=ZERO_OFFSET, annotations=annotations, raw=raw_mode,
width=width, cap=cap, cap_extensions=cap_extensions)
pat.shapes[layer].append(path)
def _boundaries_to_polygons(
def _boundary_batches_to_polygons(
pat: Pattern,
global_args: dict[str, Any],
elem: dict[str, Any],
cc: int,
) -> None:
elem_off = elem['offsets'] # which elements belong to each cell
xy_val = elem['xy_arr']
vert_arr = elem['vert_arr']
vert_off = elem['vert_off']
layer_inds = elem['layer_inds']
layer_tups = global_args['layer_tups']
poly_off = elem['poly_off']
poly_offsets = elem['poly_offsets']
batch_count = elem_off[cc + 1] - elem_off[cc]
if batch_count == 0:
return
elem_slc = slice(elem_off[cc], elem_off[cc] + batch_count + 1) # +1 to capture ending location for last elem
elem_vert_off = vert_off[elem_slc]
elem_poly_off = poly_off[elem_slc]
elem_layer_inds = layer_inds[elem_slc][:batch_count]
raw_mode = global_args['raw_mode']
for bb in range(batch_count):
layer = layer_tups[elem_layer_inds[bb]]
vertices = vert_arr[elem_vert_off[bb]:elem_vert_off[bb + 1]]
vertex_offsets = numpy.asarray(poly_offsets[elem_poly_off[bb]:elem_poly_off[bb + 1]], dtype=numpy.intp)
if vertex_offsets.size == 1:
poly = Polygon(vertices=vertices, offset=ZERO_OFFSET, annotations=None, raw=raw_mode)
pat.shapes[layer].append(poly)
else:
polys = PolyCollection(vertex_lists=vertices, vertex_offsets=vertex_offsets, offset=ZERO_OFFSET, annotations=None, raw=raw_mode)
pat.shapes[layer].append(polys)
def _boundary_props_to_polygons(
pat: Pattern,
global_args: dict[str, Any],
elem: dict[str, Any],
cc: int,
) -> None:
elem_off = elem['offsets']
vert_arr = elem['vert_arr']
vert_off = elem['vert_off']
layer_inds = elem['layer_inds']
layer_tups = global_args['layer_tups']
prop_key = elem['prop_key']
prop_val = elem['prop_val']
elem_count = elem_off[cc + 1] - elem_off[cc]
elem_slc = slice(elem_off[cc], elem_off[cc] + elem_count + 1) # +1 to capture ending location for last elem
xy_offs = elem['xy_off'][elem_slc] # which xy coords belong to each element
xy_counts = xy_offs[1:] - xy_offs[:-1]
prop_offs = elem['prop_off'][elem_slc] # which props belong to each element
prop_counts = prop_offs[1:] - prop_offs[:-1]
if elem_count == 0:
return
elem_slc = slice(elem_off[cc], elem_off[cc] + elem_count + 1)
elem_vert_off = vert_off[elem_slc]
prop_offs = elem['prop_off'][elem_slc]
elem_layer_inds = layer_inds[elem_slc][:elem_count]
order = numpy.argsort(elem_layer_inds, stable=True)
unilayer_inds, unilayer_first, unilayer_count = numpy.unique(elem_layer_inds, return_index=True, return_counts=True)
zeros = numpy.zeros((elem_count, 2))
raw_mode = global_args['raw_mode']
for layer_ind, ff, nn in zip(unilayer_inds, unilayer_first, unilayer_count, strict=True):
ee_inds = order[ff:ff + nn]
layer = layer_tups[layer_ind]
propless_mask = prop_counts[ee_inds] == 0
poly_count_on_layer = propless_mask.sum()
if poly_count_on_layer == 1:
propless_mask[:] = 0 # Never make a 1-element collection
elif poly_count_on_layer > 1:
propless_vert_counts = xy_counts[ee_inds[propless_mask]] - 1 # -1 to drop closing point
vertex_lists = numpy.empty((propless_vert_counts.sum(), 2), dtype=numpy.float64)
vertex_offsets = numpy.cumsum(numpy.concatenate([[0], propless_vert_counts]))
for ii, ee in enumerate(ee_inds[propless_mask]):
vo = vertex_offsets[ii]
vertex_lists[vo:vo + propless_vert_counts[ii]] = xy_val[xy_offs[ee]:xy_offs[ee + 1] - 1]
polys = PolyCollection(vertex_lists=vertex_lists, vertex_offsets=vertex_offsets, offset=zeros[ee])
pat.shapes[layer].append(polys)
# Handle single polygons
for ee in ee_inds[~propless_mask]:
for ee in range(elem_count):
layer = layer_tups[elem_layer_inds[ee]]
vertices = xy_val[xy_offs[ee]:xy_offs[ee + 1] - 1] # -1 to drop closing point
vertices = vert_arr[elem_vert_off[ee]:elem_vert_off[ee + 1]]
annotations = _read_annotations(prop_offs, prop_key, prop_val, ee)
poly = Polygon(vertices=vertices, offset=zeros[ee], annotations=annotations, raw=raw_mode)
poly = Polygon(vertices=vertices, offset=ZERO_OFFSET, annotations=annotations, raw=raw_mode)
pat.shapes[layer].append(poly)
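
All of the new converters share the same Arrow list-offset pattern: offsets[cc] and offsets[cc + 1] bracket cell cc's entries, the slice extends one element past the count so the closing offset of the last element is kept, and vertex offsets are divided by 2 because Arrow counts interleaved x,y scalars rather than points. A toy sketch of that pattern with made-up buffers (not the real schema):

    import numpy

    # Toy stand-ins for the Arrow buffers: two cells, one boundary each,
    # vertices flattened as interleaved x,y scalars.
    flat_xy = numpy.array([0, 0, 10, 0, 10, 10,  5, 5, 6, 5, 6, 6, 5, 6], dtype=numpy.float64)
    vert_arr = flat_xy.reshape((-1, 2))          # (N, 2) points
    vert_off = numpy.array([0, 6, 14]) // 2      # Arrow offsets count scalars; // 2 converts to points
    elem_off = numpy.array([0, 1, 2])            # boundaries-per-cell list offsets

    cc = 0                                       # cell index
    count = elem_off[cc + 1] - elem_off[cc]
    slc = slice(elem_off[cc], elem_off[cc] + count + 1)   # +1 captures the ending offset for the last elem
    offs = vert_off[slc]
    for ee in range(count):
        print(vert_arr[offs[ee]:offs[ee + 1]])   # vertices for boundary ee of cell cc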

View file

@@ -5,6 +5,7 @@ import pytest
pytest.importorskip('pyarrow')
from .. import Ref, Label
from ..library import Library
from ..pattern import Pattern
from ..repetition import Grid
@@ -119,7 +120,10 @@ def _make_arrow_test_library() -> Library:
leaf = Pattern()
leaf.polygon((1, 0), vertices=[[0, 0], [10, 0], [10, 10], [0, 10]], annotations={'1': ['leaf-poly']})
leaf.polygon((2, 0), vertices=[[40, 0], [50, 0], [50, 10], [40, 10]])
leaf.polygon((1, 0), vertices=[[20, 0], [30, 0], [30, 10], [20, 10]])
leaf.polygon((1, 0), vertices=[[80, 0], [90, 0], [90, 10], [80, 10]])
leaf.polygon((2, 0), vertices=[[60, 0], [70, 0], [70, 10], [60, 10]], annotations={'18': ['leaf-poly-2']})
leaf.label((10, 0), string='LEAF', offset=(3, 4), annotations={'10': ['leaf-label']})
lib['leaf'] = leaf
@@ -178,3 +182,80 @@ def test_gdsii_arrow_reads_small_perf_fixture(tmp_path: Path) -> None:
assert len(lib) == manifest.cells
assert 'TOP' in lib
assert sum(len(refs) for refs in lib['TOP'].refs.values()) > 0
def test_gdsii_arrow_boundary_batch_schema(tmp_path: Path) -> None:
lib = _make_arrow_test_library()
gds_file = tmp_path / 'arrow_batches.gds'
gdsii.writefile(lib, gds_file, meters_per_unit=1e-9)
libarr = gdsii_arrow._read_to_arrow(gds_file)[0]
cells = libarr['cells'].values
cell_ids = cells.field('id').to_numpy()
cell_names = libarr['cell_names'].as_py()
layer_table = [
((int(layer) >> 16) & 0xFFFF, int(layer) & 0xFFFF)
for layer in libarr['layers'].values.to_numpy()
]
leaf_index = next(ii for ii, cell_id in enumerate(cell_ids) if cell_names[cell_id] == 'leaf')
boundary_batches = cells.field('boundary_batches')[leaf_index].as_py()
boundary_props = cells.field('boundary_props')[leaf_index].as_py()
assert len(boundary_batches) == 2
assert len(boundary_props) == 2
batch_by_layer = {tuple(layer_table[entry['layer']]): entry for entry in boundary_batches}
assert batch_by_layer[(1, 0)]['vertex_offsets'] == [0, 4]
assert len(batch_by_layer[(1, 0)]['vertices']) == 16
assert batch_by_layer[(2, 0)]['vertex_offsets'] == [0]
assert len(batch_by_layer[(2, 0)]['vertices']) == 8
props_by_layer = {tuple(layer_table[entry['layer']]): entry for entry in boundary_props}
assert sorted(props_by_layer) == [(1, 0), (2, 0)]
assert props_by_layer[(1, 0)]['properties'][0]['value'] == 'leaf-poly'
assert props_by_layer[(2, 0)]['properties'][0]['value'] == 'leaf-poly-2'
def test_raw_ref_grid_label_constructors_match_public() -> None:
raw_grid = Grid._from_raw(
a_vector=numpy.array([20, 0]),
a_count=3,
b_vector=numpy.array([0, 30]),
b_count=2,
)
public_grid = Grid(a_vector=(20, 0), a_count=3, b_vector=(0, 30), b_count=2)
assert raw_grid == public_grid
raw_ref = Ref._from_raw(
offset=numpy.array([100, 200]),
rotation=numpy.pi / 2,
mirrored=True,
scale=1.25,
repetition=raw_grid,
annotations={'12': ['child-ref']},
)
public_ref = Ref(
offset=(100, 200),
rotation=numpy.pi / 2,
mirrored=True,
scale=1.25,
repetition=public_grid,
annotations={'12': ['child-ref']},
)
assert raw_ref == public_ref
assert numpy.array_equal(raw_ref.as_transforms(), public_ref.as_transforms())
raw_label = Label._from_raw(
'LEAF',
offset=numpy.array([3, 4]),
annotations={'10': ['leaf-label']},
)
public_label = Label(
'LEAF',
offset=(3, 4),
annotations={'10': ['leaf-label']},
)
assert raw_label == public_label
assert numpy.array_equal(raw_label.get_bounds_single(), public_label.get_bounds_single())
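
The layer-table decode in test_gdsii_arrow_boundary_batch_schema assumes libarr['layers'] packs (layer, datatype) into single 32-bit values, layer in the high half and datatype in the low half. A quick round-trip sketch of that assumption; the pack_layer helper is illustrative and not part of the module, whose actual packing code is not shown in this diff:

    import numpy

    def pack_layer(layer: int, dtype: int) -> int:
        # Hypothetical inverse of the unpack expression used in the test above.
        return ((layer & 0xFFFF) << 16) | (dtype & 0xFFFF)

    packed = numpy.array([pack_layer(1, 0), pack_layer(2, 0)], dtype=numpy.uint32)
    unpacked = [((int(vv) >> 16) & 0xFFFF, int(vv) & 0xFFFF) for vv in packed]
    assert unpacked == [(1, 0), (2, 0)]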