Skip to content

Commit 64a850d

Browse files
authored
Make wasm-sourcemap.py faster (#25935)
1. This replaces uses of `Pattern.split` with `Pattern.finditer`. 2. In `extract_func_ranges`, instead of first splitting the whole text into individual tags and then searching each of them for `DW_TAG_(subprogram|inlined_subroutine)`, we search the whole text for the pattern using `finditer` and try to parse a tag from each match. This improves the running time on `wasm-opt.wasm` by ~18% (14.9s -> 12.2s).
1 parent b3836dc commit 64a850d

File tree

1 file changed

+72
-41
lines changed

1 file changed

+72
-41
lines changed

tools/wasm-sourcemap.py

Lines changed: 72 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -224,14 +224,23 @@ def extract_comp_dir_map(text):
224224
comp_dir_pattern = re.compile(r"DW_AT_comp_dir\s+\(\"([^\"]+)\"\)")
225225

226226
map_stmt_list_to_comp_dir = {}
227-
chunks = compile_unit_pattern.split(text) # DW_TAG_compile_unit
228-
for chunk in chunks[1:]:
229-
stmt_list_match = stmt_list_pattern.search(chunk) # DW_AT_stmt_list
227+
iterator = compile_unit_pattern.finditer(text)
228+
current_match = next(iterator, None)
229+
230+
while current_match:
231+
next_match = next(iterator, None)
232+
start = current_match.end()
233+
end = next_match.start() if next_match else len(text)
234+
235+
stmt_list_match = stmt_list_pattern.search(text, start, end)
230236
if stmt_list_match is not None:
231237
stmt_list = stmt_list_match.group(1)
232-
comp_dir_match = comp_dir_pattern.search(chunk) # DW_AT_comp_dir
238+
comp_dir_match = comp_dir_pattern.search(text, start, end)
233239
comp_dir = decode_octal_encoded_utf8(comp_dir_match.group(1)) if comp_dir_match is not None else ''
234240
map_stmt_list_to_comp_dir[stmt_list] = comp_dir
241+
242+
current_match = next_match
243+
235244
return map_stmt_list_to_comp_dir
236245

237246

@@ -313,54 +322,60 @@ def extract_func_ranges(text):
313322
# DW_AT_high_pc (0x00000083)
314323
# ...
315324

316-
tag_pattern = re.compile(r'\r?\n(?=0x[0-9a-f]+:)')
317-
subprogram_pattern = re.compile(r"0x[0-9a-f]+:\s+DW_TAG_subprogram")
318-
inlined_pattern = re.compile(r"0x[0-9a-f]+:\s+DW_TAG_inlined_subroutine")
325+
# Pattern to find the start of the NEXT DWARF tag (boundary marker)
326+
next_tag_pattern = re.compile(r'\n0x[0-9a-f]+:')
327+
# Pattern to find DWARF tags for functions (Subprogram or Inlined) directly
328+
func_pattern = re.compile(r'DW_TAG_(?:subprogram|inlined_subroutine)')
329+
319330
low_pc_pattern = re.compile(r'DW_AT_low_pc\s+\(0x([0-9a-f]+)\)')
320331
high_pc_pattern = re.compile(r'DW_AT_high_pc\s+\(0x([0-9a-f]+)\)')
321332
abstract_origin_pattern = re.compile(r'DW_AT_abstract_origin\s+\(0x[0-9a-f]+\s+"([^"]+)"\)')
322333
linkage_name_pattern = re.compile(r'DW_AT_linkage_name\s+\("([^"]+)"\)')
323334
name_pattern = re.compile(r'DW_AT_name\s+\("([^"]+)"\)')
324335
specification_pattern = re.compile(r'DW_AT_specification\s+\(0x[0-9a-f]+\s+"([^"]+)"\)')
325336

326-
func_ranges = []
327-
dw_tags = tag_pattern.split(text)
328-
329-
def get_name_from_tag(tag):
330-
m = linkage_name_pattern.search(tag) # DW_AT_linkage_name
337+
def get_name_from_tag(start, end):
338+
m = linkage_name_pattern.search(text, start, end)
331339
if m:
332340
return m.group(1)
333-
m = name_pattern.search(tag) # DW_AT_name
341+
m = name_pattern.search(text, start, end)
334342
if m:
335343
return m.group(1)
336344
# If name is missing, check for DW_AT_specification annotation
337-
m = specification_pattern.search(tag)
345+
m = specification_pattern.search(text, start, end)
338346
if m:
339347
return m.group(1)
340348
return None
341349

342-
for tag in dw_tags:
343-
is_subprogram = subprogram_pattern.search(tag) # DW_TAG_subprogram
344-
is_inlined = inlined_pattern.search(tag) # DW_TAG_inlined_subroutine
350+
func_ranges = []
351+
for match in func_pattern.finditer(text):
352+
# Search from the end of the tag name (e.g. after "DW_TAG_subprogram").
353+
# Attributes are expected to follow.
354+
search_start = match.end()
355+
356+
# Search until the beginning of the next tag
357+
m_next = next_tag_pattern.search(text, search_start)
358+
search_end = m_next.start() if m_next else len(text)
359+
360+
name = None
361+
low_pc = None
362+
high_pc = None
363+
m = low_pc_pattern.search(text, search_start, search_end)
364+
if m:
365+
low_pc = int(m.group(1), 16)
366+
m = high_pc_pattern.search(text, search_start, search_end)
367+
if m:
368+
high_pc = int(m.group(1), 16)
345369

346-
if is_subprogram or is_inlined:
347-
name = None
348-
low_pc = None
349-
high_pc = None
350-
m = low_pc_pattern.search(tag) # DW_AT_low_pc
351-
if m:
352-
low_pc = int(m.group(1), 16)
353-
m = high_pc_pattern.search(tag) # DW_AT_high_pc
370+
if 'DW_TAG_subprogram' in match.group(0):
371+
name = get_name_from_tag(search_start, search_end)
372+
else: # is_inlined
373+
m = abstract_origin_pattern.search(text, search_start, search_end)
354374
if m:
355-
high_pc = int(m.group(1), 16)
356-
if is_subprogram:
357-
name = get_name_from_tag(tag)
358-
else: # is_inlined
359-
m = abstract_origin_pattern.search(tag) # DW_AT_abstract_origin
360-
if m:
361-
name = m.group(1)
362-
if name and low_pc is not None and high_pc is not None:
363-
func_ranges.append(FuncRange(name, low_pc, high_pc))
375+
name = m.group(1)
376+
377+
if name and low_pc is not None and high_pc is not None:
378+
func_ranges.append(FuncRange(name, low_pc, high_pc))
364379

365380
# Demangle names
366381
all_names = [item.name for item in func_ranges]
@@ -401,9 +416,23 @@ def read_dwarf_info(wasm, options):
401416
line_pattern = re.compile(r"\n0x([0-9a-f]+)\s+(\d+)\s+(\d+)\s+(\d+)(.*?end_sequence)?")
402417

403418
entries = []
404-
debug_line_chunks = debug_line_pattern.split(output)
405-
map_stmt_list_to_comp_dir = extract_comp_dir_map(debug_line_chunks[0])
406-
for stmt_list, line_chunk in zip(debug_line_chunks[1::2], debug_line_chunks[2::2], strict=True):
419+
iterator = debug_line_pattern.finditer(output)
420+
try:
421+
current_match = next(iterator)
422+
debug_info_end = current_match.start() # end of .debug_info contents
423+
except StopIteration:
424+
debug_info_end = len(output)
425+
426+
debug_info = output[:debug_info_end] # .debug_info contents
427+
map_stmt_list_to_comp_dir = extract_comp_dir_map(debug_info)
428+
429+
while current_match:
430+
next_match = next(iterator, None)
431+
432+
stmt_list = current_match.group(1)
433+
start = current_match.end()
434+
end = next_match.start() if next_match else len(output)
435+
407436
comp_dir = map_stmt_list_to_comp_dir.get(stmt_list, '')
408437

409438
# include_directories[ 1] = "/Users/yury/Work/junk/sqlite-playground/src"
@@ -422,16 +451,16 @@ def read_dwarf_info(wasm, options):
422451
# 0x0000000000000011 28 0 1 0 0 is_stmt
423452

424453
include_directories = {'0': comp_dir}
425-
for dir in include_dir_pattern.finditer(line_chunk):
454+
for dir in include_dir_pattern.finditer(output, start, end):
426455
include_directories[dir.group(1)] = os.path.join(comp_dir, decode_octal_encoded_utf8(dir.group(2)))
427456

428457
files = {}
429-
for file in file_pattern.finditer(line_chunk):
458+
for file in file_pattern.finditer(output, start, end):
430459
dir = include_directories[file.group(3)]
431460
file_path = os.path.join(dir, decode_octal_encoded_utf8(file.group(2)))
432461
files[file.group(1)] = file_path
433462

434-
for line in line_pattern.finditer(line_chunk):
463+
for line in line_pattern.finditer(output, start, end):
435464
entry = {'address': int(line.group(1), 16), 'line': int(line.group(2)), 'column': int(line.group(3)), 'file': files[line.group(4)], 'eos': line.group(5) is not None}
436465
if not entry['eos']:
437466
entries.append(entry)
@@ -444,12 +473,14 @@ def read_dwarf_info(wasm, options):
444473
else:
445474
entries.append(entry)
446475

476+
current_match = next_match
477+
447478
remove_dead_entries(entries)
448479

449480
# return entries sorted by the address field
450481
entries = sorted(entries, key=lambda entry: entry['address'])
451482

452-
func_ranges = extract_func_ranges(debug_line_chunks[0])
483+
func_ranges = extract_func_ranges(debug_info)
453484
return entries, func_ranges
454485

455486

0 commit comments

Comments
 (0)