Skip to content

Commit 2e4483d

Browse files
committed
Remove vestigial token balancing code
There's a big TODO about removing this when we finally fully forked lxml.html's differ. That happened a long time ago, and we did in fact make the changes that turned this into effectively wasted iteration/dead code. I ran a few tests over a variety of big and small diffs to make sure the code being removed here really doesn't do anything anymore, and that seems to be the case. Reading the logic, it also seems like this should be entirely vestigial, and never wind up actually changing the tokens.
1 parent 38245ef commit 2e4483d

File tree

1 file changed

+13
-48
lines changed

1 file changed

+13
-48
lines changed

web_monitoring_diff/html_render_diff.py

+13-48
Original file line number | Diff line number | Diff line change
@@ -850,6 +850,19 @@ def fixup_chunks(chunks, comparator):
850850
tag_accum.append(chunk[1])
851851

852852
elif current_token == TokenType.end_tag:
853+
# Ensure any closing tags get added to the previous token as
854+
# `post_tags`, rather than the next token as `pre_tags`. This makes
855+
# placing the end of elements in the right place when re-assembling
856+
# the final diff from added/removed tokens easier to do.
857+
#
858+
# That is, given HTML like:
859+
#
860+
# <p><a>Hello!</a></p><div>…there.</div>
861+
#
862+
# We want output like:
863+
#
864+
# [('Hello!', pre=['<p>','<a>'], post=['</a>','</p>']),
865+
# ('…there.', pre=['<div>'], post=['</div>'])]
853866
if tag_accum:
854867
tag_accum.append(chunk[1])
855868
else:
@@ -1034,54 +1047,6 @@ def __hash__(self):
10341047
def _customize_tokens(tokens):
10351048
SPACER_STRING = '\nSPACER'
10361049

1037-
# Balance out pre- and post-tags so that a token of text is surrounded by
1038-
# the opening and closing tags of the element it's in. For example:
1039-
#
1040-
# <p><a>Hello!</a></p><div>…there.</div>
1041-
#
1042-
# Currently parses as:
1043-
# [('Hello!', pre=['<p>','<a>'], post=[]),
1044-
# ('…there.', pre=['</a>','</p>','<div>'], post=['</div>'])]
1045-
# (Note the '</div>' post tag is only present at the end of the doc)
1046-
#
1047-
# But this attempts to make it more like:
1048-
#
1049-
# [('Hello!', pre=['<p>','<a>'], post=['</a>','</p>']),
1050-
# ('…there.', pre=['<div>'], post=['</div>'])]
1051-
#
1052-
# TODO: when we get around to also forking the parse/tokenize part of this
1053-
# diff, do this as part of the original tokenization instead.
1054-
for token_index, token in enumerate(tokens):
1055-
# logger.debug(f'Handling token {token_index}: {token}')
1056-
if token_index == 0:
1057-
continue
1058-
previous = tokens[token_index - 1]
1059-
previous_post_complete = False
1060-
for post_index, tag in enumerate(previous.post_tags):
1061-
if not tag.startswith('</'):
1062-
# TODO: should we attempt to fill pure-structure tags here with
1063-
# spacers? e.g. should we take the "<p><em></em></p>" here and
1064-
# wrap a spacer token in it instead of moving to "next-text's"
1065-
# pre_tags? "text</p><p><em></em></p><p>next-text"
1066-
token.pre_tags = previous.post_tags[post_index:] + token.pre_tags
1067-
previous.post_tags = previous.post_tags[:post_index]
1068-
previous_post_complete = True
1069-
break
1070-
1071-
if not previous_post_complete:
1072-
for pre_index, tag in enumerate(token.pre_tags):
1073-
if not tag.startswith('</'):
1074-
if pre_index > 0:
1075-
previous.post_tags.extend(token.pre_tags[:pre_index])
1076-
token.pre_tags = token.pre_tags[pre_index:]
1077-
break
1078-
else:
1079-
previous.post_tags.extend(token.pre_tags)
1080-
token.pre_tags = []
1081-
1082-
1083-
# logger.debug(f' Result...\n pre: {token.pre_tags}\n token: "{token}"\n post: {token.post_tags}')
1084-
10851050
result = []
10861051
# for token in tokens:
10871052
for token_index, token in enumerate(tokens):

0 commit comments

Comments
 (0)