@@ -850,6 +850,19 @@ def fixup_chunks(chunks, comparator):
             tag_accum.append(chunk[1])

         elif current_token == TokenType.end_tag:
+            # Ensure any closing tags get added to the previous token as
+            # `post_tags`, rather than to the next token as `pre_tags`. This
+            # makes it easier to place the ends of elements correctly when
+            # re-assembling the final diff from added/removed tokens.
+            #
+            # That is, given HTML like:
+            #
+            #   <p><a>Hello!</a></p><div>…there.</div>
+            #
+            # We want output like:
+            #
+            #   [('Hello!', pre=['<p>','<a>'], post=['</a>','</p>']),
+            #    ('…there.', pre=['<div>'], post=['</div>'])]
             if tag_accum:
                 tag_accum.append(chunk[1])
             else:
@@ -1034,54 +1047,6 @@ def __hash__(self):
 def _customize_tokens(tokens):
     SPACER_STRING = '\nSPACER'

-    # Balance out pre- and post-tags so that a token of text is surrounded by
-    # the opening and closing tags of the element it's in. For example:
-    #
-    #   <p><a>Hello!</a></p><div>…there.</div>
-    #
-    # Currently parses as:
-    #   [('Hello!', pre=['<p>','<a>'], post=[]),
-    #    ('…there.', pre=['</a>','</p>','<div>'], post=['</div>'])]
-    #   (Note the '</div>' post tag is only present at the end of the doc)
-    #
-    # But this attempts make it more like:
-    #
-    #   [('Hello!', pre=['<p>','<a>'], post=['</a>','</p>']),
-    #    ('…there.', pre=[<div>'], post=['</div>'])]
-    #
-    # TODO: when we get around to also forking the parse/tokenize part of this
-    # diff, do this as part of the original tokenization instead.
-    for token_index, token in enumerate(tokens):
-        # logger.debug(f'Handling token {token_index}: {token}')
-        if token_index == 0:
-            continue
-        previous = tokens[token_index - 1]
-        previous_post_complete = False
-        for post_index, tag in enumerate(previous.post_tags):
-            if not tag.startswith('</'):
-                # TODO: should we attempt to fill pure-structure tags here with
-                # spacers? e.g. should we take the "<p><em></em></p>" here and
-                # wrap a spacer token in it instead of moving to "next-text's"
-                # pre_tags? "text</p><p><em></em></p><p>next-text"
-                token.pre_tags = previous.post_tags[post_index:] + token.pre_tags
-                previous.post_tags = previous.post_tags[:post_index]
-                previous_post_complete = True
-                break
-
-        if not previous_post_complete:
-            for pre_index, tag in enumerate(token.pre_tags):
-                if not tag.startswith('</'):
-                    if pre_index > 0:
-                        previous.post_tags.extend(token.pre_tags[:pre_index])
-                        token.pre_tags = token.pre_tags[pre_index:]
-                    break
-            else:
-                previous.post_tags.extend(token.pre_tags)
-                token.pre_tags = []
-
-
-        # logger.debug(f'  Result...\n  pre: {token.pre_tags}\n  token: "{token}"\n  post: {token.post_tags}')
-
     result = []
     # for token in tokens:
     for token_index, token in enumerate(tokens):
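
For illustration, here is a small, self-contained sketch of the pre-/post-tag balancing that the comments in this commit describe. It is not the project's actual implementation: `Token` and `balance_tags` below are hypothetical stand-ins for the real token objects (which carry `pre_tags`/`post_tags` lists), and the balancing is collapsed into a single pass rather than the two-loop form removed from `_customize_tokens`.

# Minimal sketch (hypothetical Token/balance_tags, not the project's real API)
# of the pre-/post-tag balancing described in the comments above.

class Token(str):
    """A text token plus the tags opened before it and closed after it."""
    def __new__(cls, text, pre_tags=None, post_tags=None):
        obj = super().__new__(cls, text)
        obj.pre_tags = pre_tags or []
        obj.post_tags = post_tags or []
        return obj


def balance_tags(tokens):
    """Move leading closing tags from each token's pre_tags onto the previous
    token's post_tags, so every token is wrapped by the tags of the element
    it actually sits in."""
    for index, token in enumerate(tokens):
        if index == 0:
            continue
        previous = tokens[index - 1]
        # Everything before the first non-closing tag belongs to the
        # element(s) that the previous token closed.
        split = next((i for i, tag in enumerate(token.pre_tags)
                      if not tag.startswith('</')), len(token.pre_tags))
        previous.post_tags.extend(token.pre_tags[:split])
        token.pre_tags = token.pre_tags[split:]
    return tokens


if __name__ == '__main__':
    # The example from the diff: <p><a>Hello!</a></p><div>…there.</div>
    tokens = [
        Token('Hello!', pre_tags=['<p>', '<a>'], post_tags=[]),
        Token('…there.', pre_tags=['</a>', '</p>', '<div>'], post_tags=['</div>']),
    ]
    balance_tags(tokens)
    for t in tokens:
        print(f'{str(t)!r:12} pre={t.pre_tags} post={t.post_tags}')
    # 'Hello!'     pre=['<p>', '<a>'] post=['</a>', '</p>']
    # '…there.'    pre=['<div>'] post=['</div>']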