[gdsii_arrow] further improvements to speed

This commit is contained in:
Jan Petykiewicz 2026-04-02 15:47:41 -07:00
commit 28562f73f6
3 changed files with 235 additions and 61 deletions

View file

@ -5,17 +5,18 @@ import importlib
import json import json
import time import time
from pathlib import Path from pathlib import Path
from typing import Any
from masque import LibraryError from masque import LibraryError
READERS = { READERS: dict[str, tuple[str, tuple[str, ...]]] = {
'gdsii': ('masque.file.gdsii', 'readfile'), 'gdsii': ('masque.file.gdsii', ('readfile',)),
'gdsii_arrow': ('masque.file.gdsii_arrow', 'readfile'), 'gdsii_arrow': ('masque.file.gdsii_arrow', ('readfile', 'arrow_import', 'arrow_convert')),
} }
def _summarize(path: Path, elapsed_s: float, info: dict[str, object], lib: object) -> dict[str, object]: def _summarize_library(path: Path, elapsed_s: float, info: dict[str, object], lib: object) -> dict[str, object]:
assert hasattr(lib, '__len__') assert hasattr(lib, '__len__')
assert hasattr(lib, 'tops') assert hasattr(lib, 'tops')
tops = lib.tops() # type: ignore[no-any-return, attr-defined] tops = lib.tops() # type: ignore[no-any-return, attr-defined]
@ -34,12 +35,50 @@ def _summarize(path: Path, elapsed_s: float, info: dict[str, object], lib: objec
} }
def _summarize_arrow_import(path: Path, elapsed_s: float, arrow_arr: Any) -> dict[str, object]:
libarr = arrow_arr[0]
return {
'path': str(path),
'elapsed_s': elapsed_s,
'arrow_rows': len(arrow_arr),
'library_name': libarr['lib_name'].as_py(),
'cell_count': len(libarr['cells']),
'layer_count': len(libarr['layers']),
}
def _profile_stage(module: Any, stage: str, path: Path) -> dict[str, object]:
start = time.perf_counter()
if stage == 'readfile':
lib, info = module.readfile(path)
elapsed_s = time.perf_counter() - start
return _summarize_library(path, elapsed_s, info, lib)
if stage == 'arrow_import':
arrow_arr = module._read_to_arrow(path)
elapsed_s = time.perf_counter() - start
return _summarize_arrow_import(path, elapsed_s, arrow_arr)
if stage == 'arrow_convert':
arrow_arr = module._read_to_arrow(path)
libarr = arrow_arr[0]
start = time.perf_counter()
lib, info = module.read_arrow(libarr)
elapsed_s = time.perf_counter() - start
return _summarize_library(path, elapsed_s, info, lib)
raise ValueError(f'Unsupported stage {stage!r}')
def build_arg_parser() -> argparse.ArgumentParser: def build_arg_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description='Profile GDS readers with a stable end-to-end workload.') parser = argparse.ArgumentParser(description='Profile GDS readers with a stable end-to-end workload.')
parser.add_argument('--reader', choices=sorted(READERS), required=True) parser.add_argument('--reader', choices=sorted(READERS), required=True)
parser.add_argument('--stage', default='readfile')
parser.add_argument('--path', type=Path, required=True) parser.add_argument('--path', type=Path, required=True)
parser.add_argument('--warmup', type=int, default=1) parser.add_argument('--warmup', type=int, default=1)
parser.add_argument('--repeat', type=int, default=1) parser.add_argument('--repeat', type=int, default=1)
parser.add_argument('--output-json', type=Path)
return parser return parser
@ -47,26 +86,32 @@ def main(argv: list[str] | None = None) -> int:
parser = build_arg_parser() parser = build_arg_parser()
args = parser.parse_args(argv) args = parser.parse_args(argv)
module_name, attr_name = READERS[args.reader] module_name, stages = READERS[args.reader]
readfile = getattr(importlib.import_module(module_name), attr_name) if args.stage not in stages:
parser.error(f'reader {args.reader!r} only supports stages: {", ".join(stages)}')
module = importlib.import_module(module_name)
path = args.path.expanduser().resolve() path = args.path.expanduser().resolve()
for _ in range(args.warmup): for _ in range(args.warmup):
readfile(path) _profile_stage(module, args.stage, path)
runs = [] runs = []
for _ in range(args.repeat): for _ in range(args.repeat):
start = time.perf_counter() runs.append(_profile_stage(module, args.stage, path))
lib, info = readfile(path)
elapsed_s = time.perf_counter() - start
runs.append(_summarize(path, elapsed_s, info, lib))
print(json.dumps({ payload = {
'reader': args.reader, 'reader': args.reader,
'stage': args.stage,
'warmup': args.warmup, 'warmup': args.warmup,
'repeat': args.repeat, 'repeat': args.repeat,
'runs': runs, 'runs': runs,
}, indent=2, sort_keys=True)) }
rendered = json.dumps(payload, indent=2, sort_keys=True)
if args.output_json is not None:
args.output_json.parent.mkdir(parents=True, exist_ok=True)
args.output_json.write_text(rendered + '\n')
print(rendered)
return 0 return 0

View file

@ -57,6 +57,8 @@ ffi.cdef('void read_path(char* path, struct ArrowArray* array, struct ArrowSchem
clib: Any | None = None clib: Any | None = None
ZERO_OFFSET = numpy.zeros(2)
path_cap_map = { path_cap_map = {
0: Path.Cap.Flush, 0: Path.Cap.Flush,
@ -263,6 +265,29 @@ def read_arrow(
) )
return elem return elem
def get_boundary_batches(libarr: pyarrow.Array) -> dict[str, Any]:
    """Flatten the per-cell 'boundary_batches' column into numpy views.

    Keys:
      offsets      -- per-cell offsets into the batch list
      layer_inds   -- layer index of each batch
      vert_arr     -- all vertices, reshaped to (N, 2)
      vert_off     -- per-batch vertex offsets (halved: two floats per vertex)
      poly_off     -- per-batch offsets into the polygon-offset list
      poly_offsets -- per-polygon vertex offsets within each batch
    """
    batches = libarr['cells'].values.field('boundary_batches')
    entries = batches.values
    verts = entries.field('vertices')
    polys = entries.field('vertex_offsets')
    return {
        'offsets': batches.offsets.to_numpy(),
        'layer_inds': entries.field('layer').to_numpy(),
        'vert_arr': verts.values.to_numpy().reshape((-1, 2)),
        'vert_off': verts.offsets.to_numpy() // 2,
        'poly_off': polys.offsets.to_numpy(),
        'poly_offsets': polys.values.to_numpy(),
    }
def get_boundary_props(libarr: pyarrow.Array) -> dict[str, Any]:
    """Flatten the per-cell 'boundary_props' column into numpy views.

    These are the boundaries that carry GDS properties; their vertex data
    is exposed like `get_boundary_batches`, plus the decoded property
    key/value lists (`prop_off`, `prop_key`, `prop_val`).
    """
    boundaries = libarr['cells'].values.field('boundary_props')
    entries = boundaries.values
    verts = entries.field('vertices')
    props = entries.field('properties')
    return {
        'offsets': boundaries.offsets.to_numpy(),
        'layer_inds': entries.field('layer').to_numpy(),
        'vert_arr': verts.values.to_numpy().reshape((-1, 2)),
        'vert_off': verts.offsets.to_numpy() // 2,
        'prop_off': props.offsets.to_numpy(),
        'prop_key': props.values.field('key').to_numpy(),
        'prop_val': props.values.field('value').to_pylist(),
    }
rf = libarr['cells'].values.field('refs') rf = libarr['cells'].values.field('refs')
refs = dict( refs = dict(
offsets = rf.offsets.to_numpy(), offsets = rf.offsets.to_numpy(),
@ -292,10 +317,9 @@ def read_arrow(
) )
elements = dict( elements = dict(
boundaries = get_geom(libarr, 'boundaries'), boundary_batches = get_boundary_batches(libarr),
boundary_props = get_boundary_props(libarr),
paths = get_geom(libarr, 'paths'), paths = get_geom(libarr, 'paths'),
boxes = get_geom(libarr, 'boxes'),
nodes = get_geom(libarr, 'nodes'),
texts = texts, texts = texts,
refs = refs, refs = refs,
) )
@ -320,7 +344,8 @@ def read_arrow(
for cc in range(len(libarr['cells'])): for cc in range(len(libarr['cells'])):
name = cell_names[cell_ids[cc]] name = cell_names[cell_ids[cc]]
pat = Pattern() pat = Pattern()
_boundaries_to_polygons(pat, global_args, elements['boundaries'], cc) _boundary_batches_to_polygons(pat, global_args, elements['boundary_batches'], cc)
_boundary_props_to_polygons(pat, global_args, elements['boundary_props'], cc)
_gpaths_to_mpaths(pat, global_args, elements['paths'], cc) _gpaths_to_mpaths(pat, global_args, elements['paths'], cc)
_grefs_to_mrefs(pat, global_args, elements['refs'], cc) _grefs_to_mrefs(pat, global_args, elements['refs'], cc)
_texts_to_labels(pat, global_args, elements['texts'], cc) _texts_to_labels(pat, global_args, elements['texts'], cc)
@ -366,6 +391,7 @@ def _grefs_to_mrefs(
elem_rep_xy1 = elem['rep_xy1'][elem_slc][:elem_count] elem_rep_xy1 = elem['rep_xy1'][elem_slc][:elem_count]
elem_rep_counts = elem['rep_counts'][elem_slc][:elem_count] elem_rep_counts = elem['rep_counts'][elem_slc][:elem_count]
rep_valid = elem['rep_valid'][elem_slc][:elem_count] rep_valid = elem['rep_valid'][elem_slc][:elem_count]
raw_mode = global_args['raw_mode']
for ee in range(elem_count): for ee in range(elem_count):
@ -380,9 +406,15 @@ def _grefs_to_mrefs(
a_vector = elem_rep_xy0[ee] a_vector = elem_rep_xy0[ee]
b_vector = elem_rep_xy1[ee] b_vector = elem_rep_xy1[ee]
a_count, b_count = elem_rep_counts[ee] a_count, b_count = elem_rep_counts[ee]
if raw_mode:
rep = Grid._from_raw(a_vector=a_vector, b_vector=b_vector, a_count=a_count, b_count=b_count)
else:
rep = Grid(a_vector=a_vector, b_vector=b_vector, a_count=a_count, b_count=b_count) rep = Grid(a_vector=a_vector, b_vector=b_vector, a_count=a_count, b_count=b_count)
annotations = _read_annotations(prop_offs, prop_key, prop_val, ee) annotations = _read_annotations(prop_offs, prop_key, prop_val, ee)
if raw_mode:
ref = Ref._from_raw(offset=offset, mirrored=mirr, rotation=rot, scale=mag, repetition=rep, annotations=annotations)
else:
ref = Ref(offset=offset, mirrored=mirr, rotation=rot, scale=mag, repetition=rep, annotations=annotations) ref = Ref(offset=offset, mirrored=mirr, rotation=rot, scale=mag, repetition=rep, annotations=annotations)
pat.refs[target].append(ref) pat.refs[target].append(ref)
@ -406,6 +438,7 @@ def _texts_to_labels(
elem_xy = xy[elem_slc][:elem_count] elem_xy = xy[elem_slc][:elem_count]
elem_layer_inds = layer_inds[elem_slc][:elem_count] elem_layer_inds = layer_inds[elem_slc][:elem_count]
elem_strings = elem['string'][elem_slc][:elem_count] elem_strings = elem['string'][elem_slc][:elem_count]
raw_mode = global_args['raw_mode']
for ee in range(elem_count): for ee in range(elem_count):
layer = layer_tups[elem_layer_inds[ee]] layer = layer_tups[elem_layer_inds[ee]]
@ -413,6 +446,9 @@ def _texts_to_labels(
string = elem_strings[ee] string = elem_strings[ee]
annotations = _read_annotations(prop_offs, prop_key, prop_val, ee) annotations = _read_annotations(prop_offs, prop_key, prop_val, ee)
if raw_mode:
mlabel = Label._from_raw(string=string, offset=offset, annotations=annotations)
else:
mlabel = Label(string=string, offset=offset, annotations=annotations) mlabel = Label(string=string, offset=offset, annotations=annotations)
pat.labels[layer].append(mlabel) pat.labels[layer].append(mlabel)
@ -439,7 +475,6 @@ def _gpaths_to_mpaths(
elem_path_types = elem['path_type'][elem_slc][:elem_count] elem_path_types = elem['path_type'][elem_slc][:elem_count]
elem_extensions = elem['extensions'][elem_slc][:elem_count] elem_extensions = elem['extensions'][elem_slc][:elem_count]
zeros = numpy.zeros((elem_count, 2))
raw_mode = global_args['raw_mode'] raw_mode = global_args['raw_mode']
for ee in range(elem_count): for ee in range(elem_count):
layer = layer_tups[elem_layer_inds[ee]] layer = layer_tups[elem_layer_inds[ee]]
@ -453,64 +488,77 @@ def _gpaths_to_mpaths(
cap_extensions = None cap_extensions = None
annotations = _read_annotations(prop_offs, prop_key, prop_val, ee) annotations = _read_annotations(prop_offs, prop_key, prop_val, ee)
path = Path(vertices=vertices, offset=zeros[ee], annotations=annotations, raw=raw_mode, path = Path(vertices=vertices, offset=ZERO_OFFSET, annotations=annotations, raw=raw_mode,
width=width, cap=cap,cap_extensions=cap_extensions) width=width, cap=cap,cap_extensions=cap_extensions)
pat.shapes[layer].append(path) pat.shapes[layer].append(path)
def _boundaries_to_polygons( def _boundary_batches_to_polygons(
pat: Pattern, pat: Pattern,
global_args: dict[str, Any], global_args: dict[str, Any],
elem: dict[str, Any], elem: dict[str, Any],
cc: int, cc: int,
) -> None: ) -> None:
elem_off = elem['offsets'] # which elements belong to each cell elem_off = elem['offsets'] # which elements belong to each cell
xy_val = elem['xy_arr'] vert_arr = elem['vert_arr']
vert_off = elem['vert_off']
layer_inds = elem['layer_inds']
layer_tups = global_args['layer_tups']
poly_off = elem['poly_off']
poly_offsets = elem['poly_offsets']
batch_count = elem_off[cc + 1] - elem_off[cc]
if batch_count == 0:
return
elem_slc = slice(elem_off[cc], elem_off[cc] + batch_count + 1) # +1 to capture ending location for last elem
elem_vert_off = vert_off[elem_slc]
elem_poly_off = poly_off[elem_slc]
elem_layer_inds = layer_inds[elem_slc][:batch_count]
raw_mode = global_args['raw_mode']
for bb in range(batch_count):
layer = layer_tups[elem_layer_inds[bb]]
vertices = vert_arr[elem_vert_off[bb]:elem_vert_off[bb + 1]]
vertex_offsets = numpy.asarray(poly_offsets[elem_poly_off[bb]:elem_poly_off[bb + 1]], dtype=numpy.intp)
if vertex_offsets.size == 1:
poly = Polygon(vertices=vertices, offset=ZERO_OFFSET, annotations=None, raw=raw_mode)
pat.shapes[layer].append(poly)
else:
polys = PolyCollection(vertex_lists=vertices, vertex_offsets=vertex_offsets, offset=ZERO_OFFSET, annotations=None, raw=raw_mode)
pat.shapes[layer].append(polys)
def _boundary_props_to_polygons(
pat: Pattern,
global_args: dict[str, Any],
elem: dict[str, Any],
cc: int,
) -> None:
elem_off = elem['offsets']
vert_arr = elem['vert_arr']
vert_off = elem['vert_off']
layer_inds = elem['layer_inds'] layer_inds = elem['layer_inds']
layer_tups = global_args['layer_tups'] layer_tups = global_args['layer_tups']
prop_key = elem['prop_key'] prop_key = elem['prop_key']
prop_val = elem['prop_val'] prop_val = elem['prop_val']
elem_count = elem_off[cc + 1] - elem_off[cc] elem_count = elem_off[cc + 1] - elem_off[cc]
elem_slc = slice(elem_off[cc], elem_off[cc] + elem_count + 1) # +1 to capture ending location for last elem if elem_count == 0:
xy_offs = elem['xy_off'][elem_slc] # which xy coords belong to each element return
xy_counts = xy_offs[1:] - xy_offs[:-1]
prop_offs = elem['prop_off'][elem_slc] # which props belong to each element elem_slc = slice(elem_off[cc], elem_off[cc] + elem_count + 1)
prop_counts = prop_offs[1:] - prop_offs[:-1] elem_vert_off = vert_off[elem_slc]
prop_offs = elem['prop_off'][elem_slc]
elem_layer_inds = layer_inds[elem_slc][:elem_count] elem_layer_inds = layer_inds[elem_slc][:elem_count]
order = numpy.argsort(elem_layer_inds, stable=True)
unilayer_inds, unilayer_first, unilayer_count = numpy.unique(elem_layer_inds, return_index=True, return_counts=True)
zeros = numpy.zeros((elem_count, 2))
raw_mode = global_args['raw_mode'] raw_mode = global_args['raw_mode']
for layer_ind, ff, nn in zip(unilayer_inds, unilayer_first, unilayer_count, strict=True): for ee in range(elem_count):
ee_inds = order[ff:ff + nn]
layer = layer_tups[layer_ind]
propless_mask = prop_counts[ee_inds] == 0
poly_count_on_layer = propless_mask.sum()
if poly_count_on_layer == 1:
propless_mask[:] = 0 # Never make a 1-element collection
elif poly_count_on_layer > 1:
propless_vert_counts = xy_counts[ee_inds[propless_mask]] - 1 # -1 to drop closing point
vertex_lists = numpy.empty((propless_vert_counts.sum(), 2), dtype=numpy.float64)
vertex_offsets = numpy.cumsum(numpy.concatenate([[0], propless_vert_counts]))
for ii, ee in enumerate(ee_inds[propless_mask]):
vo = vertex_offsets[ii]
vertex_lists[vo:vo + propless_vert_counts[ii]] = xy_val[xy_offs[ee]:xy_offs[ee + 1] - 1]
polys = PolyCollection(vertex_lists=vertex_lists, vertex_offsets=vertex_offsets, offset=zeros[ee])
pat.shapes[layer].append(polys)
# Handle single polygons
for ee in ee_inds[~propless_mask]:
layer = layer_tups[elem_layer_inds[ee]] layer = layer_tups[elem_layer_inds[ee]]
vertices = xy_val[xy_offs[ee]:xy_offs[ee + 1] - 1] # -1 to drop closing point vertices = vert_arr[elem_vert_off[ee]:elem_vert_off[ee + 1]]
annotations = _read_annotations(prop_offs, prop_key, prop_val, ee) annotations = _read_annotations(prop_offs, prop_key, prop_val, ee)
poly = Polygon(vertices=vertices, offset=zeros[ee], annotations=annotations, raw=raw_mode) poly = Polygon(vertices=vertices, offset=ZERO_OFFSET, annotations=annotations, raw=raw_mode)
pat.shapes[layer].append(poly) pat.shapes[layer].append(poly)

View file

@ -5,6 +5,7 @@ import pytest
pytest.importorskip('pyarrow') pytest.importorskip('pyarrow')
from .. import Ref, Label
from ..library import Library from ..library import Library
from ..pattern import Pattern from ..pattern import Pattern
from ..repetition import Grid from ..repetition import Grid
@ -119,7 +120,10 @@ def _make_arrow_test_library() -> Library:
leaf = Pattern() leaf = Pattern()
leaf.polygon((1, 0), vertices=[[0, 0], [10, 0], [10, 10], [0, 10]], annotations={'1': ['leaf-poly']}) leaf.polygon((1, 0), vertices=[[0, 0], [10, 0], [10, 10], [0, 10]], annotations={'1': ['leaf-poly']})
leaf.polygon((2, 0), vertices=[[40, 0], [50, 0], [50, 10], [40, 10]])
leaf.polygon((1, 0), vertices=[[20, 0], [30, 0], [30, 10], [20, 10]]) leaf.polygon((1, 0), vertices=[[20, 0], [30, 0], [30, 10], [20, 10]])
leaf.polygon((1, 0), vertices=[[80, 0], [90, 0], [90, 10], [80, 10]])
leaf.polygon((2, 0), vertices=[[60, 0], [70, 0], [70, 10], [60, 10]], annotations={'18': ['leaf-poly-2']})
leaf.label((10, 0), string='LEAF', offset=(3, 4), annotations={'10': ['leaf-label']}) leaf.label((10, 0), string='LEAF', offset=(3, 4), annotations={'10': ['leaf-label']})
lib['leaf'] = leaf lib['leaf'] = leaf
@ -178,3 +182,80 @@ def test_gdsii_arrow_reads_small_perf_fixture(tmp_path: Path) -> None:
assert len(lib) == manifest.cells assert len(lib) == manifest.cells
assert 'TOP' in lib assert 'TOP' in lib
assert sum(len(refs) for refs in lib['TOP'].refs.values()) > 0 assert sum(len(refs) for refs in lib['TOP'].refs.values()) > 0
def test_gdsii_arrow_boundary_batch_schema(tmp_path: Path) -> None:
    """Round-trip the fixture library and check the leaf cell's boundary schema."""
    library = _make_arrow_test_library()
    gds_path = tmp_path / 'arrow_batches.gds'
    gdsii.writefile(library, gds_path, meters_per_unit=1e-9)

    libarr = gdsii_arrow._read_to_arrow(gds_path)[0]
    cells = libarr['cells'].values
    names = libarr['cell_names'].as_py()
    ids = cells.field('id').to_numpy()

    # Packed layer words decode as (layer, datatype) 16-bit halves.
    decoded_layers = []
    for packed in libarr['layers'].values.to_numpy():
        word = int(packed)
        decoded_layers.append(((word >> 16) & 0xFFFF, word & 0xFFFF))

    leaf_ii = next(ii for ii, cid in enumerate(ids) if names[cid] == 'leaf')
    batches = cells.field('boundary_batches')[leaf_ii].as_py()
    props = cells.field('boundary_props')[leaf_ii].as_py()
    assert len(batches) == 2
    assert len(props) == 2

    batch_by_layer = {decoded_layers[entry['layer']]: entry for entry in batches}
    assert batch_by_layer[(1, 0)]['vertex_offsets'] == [0, 4]
    assert len(batch_by_layer[(1, 0)]['vertices']) == 16
    assert batch_by_layer[(2, 0)]['vertex_offsets'] == [0]
    assert len(batch_by_layer[(2, 0)]['vertices']) == 8

    props_by_layer = {decoded_layers[entry['layer']]: entry for entry in props}
    assert sorted(props_by_layer) == [(1, 0), (2, 0)]
    assert props_by_layer[(1, 0)]['properties'][0]['value'] == 'leaf-poly'
    assert props_by_layer[(2, 0)]['properties'][0]['value'] == 'leaf-poly-2'
def test_raw_ref_grid_label_constructors_match_public() -> None:
    """The private `_from_raw` constructors must match their public equivalents."""
    grid_raw = Grid._from_raw(
        a_vector=numpy.array([20, 0]),
        a_count=3,
        b_vector=numpy.array([0, 30]),
        b_count=2,
        )
    grid_pub = Grid(a_vector=(20, 0), a_count=3, b_vector=(0, 30), b_count=2)
    assert grid_raw == grid_pub

    ref_raw = Ref._from_raw(
        offset=numpy.array([100, 200]),
        rotation=numpy.pi / 2,
        mirrored=True,
        scale=1.25,
        repetition=grid_raw,
        annotations={'12': ['child-ref']},
        )
    ref_pub = Ref(
        offset=(100, 200),
        rotation=numpy.pi / 2,
        mirrored=True,
        scale=1.25,
        repetition=grid_pub,
        annotations={'12': ['child-ref']},
        )
    assert ref_raw == ref_pub
    assert numpy.array_equal(ref_raw.as_transforms(), ref_pub.as_transforms())

    label_raw = Label._from_raw(
        'LEAF',
        offset=numpy.array([3, 4]),
        annotations={'10': ['leaf-label']},
        )
    label_pub = Label(
        'LEAF',
        offset=(3, 4),
        annotations={'10': ['leaf-label']},
        )
    assert label_raw == label_pub
    assert numpy.array_equal(label_raw.get_bounds_single(), label_pub.get_bounds_single())