diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e7a07be..2fe66d0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -18,10 +18,19 @@ jobs: matrix: os: [ubuntu-latest, windows-latest, macOS-latest] python-version: [3.8, 3.11] - + + permissions: + contents: write + steps: - - uses: actions/checkout@v3 - + - if: ${{ github.event_name == 'pull_request' }} + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.ref }} + + - if: ${{ github.event_name != 'pull_request' }} + uses: actions/checkout@v4 + - name: Install poetry run: pipx install poetry diff --git a/coverage_badge.svg b/coverage_badge.svg index e5db27c..6bfc8fa 100644 --- a/coverage_badge.svg +++ b/coverage_badge.svg @@ -15,7 +15,7 @@ coverage coverage - 100% - 100% + 99% + 99% diff --git a/ens_normalize/normalization.py b/ens_normalize/normalization.py index 6f95efb..65d9435 100644 --- a/ens_normalize/normalization.py +++ b/ens_normalize/normalization.py @@ -558,7 +558,10 @@ def post_check_empty(name: str, input: str) -> Optional[CurableSequence]: # fully ignorable name return CurableSequence( CurableSequenceType.EMPTY_LABEL, - index=0, + # We set the index to -1 to let offset_err_start() + # know that this is the special empty name case. + # Otherwise, it would offset the index past the ignored characters. + index=-1, sequence=input, suggested='', ) @@ -581,7 +584,7 @@ def post_check_empty(name: str, input: str) -> Optional[CurableSequence]: return CurableSequence( CurableSequenceType.EMPTY_LABEL, index=i, - sequence='..', + sequence='..', # !! suggested='.', ) @@ -598,7 +601,7 @@ def post_check_underscore(label: str) -> Optional[CurableSequence]: return CurableSequence( CurableSequenceType.UNDERSCORE, index=i, - sequence='_' * cnt, + sequence='_' * cnt, # !! 
def restore_ignored_in_sequence(seq: str, input: str) -> str:
    """
    Re-expand a sequence so that it covers the matching span of the input.

    The characters of ``seq`` are consumed in order while walking ``input``
    from its first character. An input character consumes the next sequence
    character when it is that same character, or when its entry in
    ``NORMALIZATION.mapped`` contains it. Once the first sequence character
    has been consumed, every input character that is walked over is kept,
    so characters sitting between two matches (e.g. ignored ones) end up in
    the result. Input characters before the first match are dropped.

    Args:
        seq: The sequence to restore ignored characters into
        input: The input string that may contain ignored characters

    Returns:
        The input span running from the first match through the last match,
        or ``seq`` unchanged when it is empty or could not be matched in full.
    """
    if not seq:
        return seq

    total = len(seq)
    consumed = 0
    pieces = []

    for ch in input:
        if consumed >= total:
            # every character of seq has been located; stop walking input
            break

        cp = ord(ch)
        wanted = ord(seq[consumed])
        # a character without a mapping stands for itself
        candidates = NORMALIZATION.mapped.get(cp, [cp])

        if cp == wanted or wanted in candidates:
            pieces.append(ch)
            consumed += 1
        elif consumed > 0:
            # between two matches: keep the in-between input character
            pieces.append(ch)

    # a partial match is not trustworthy, so fall back to the given sequence
    return ''.join(pieces) if consumed >= total else seq
""" + if err.index < 0: + # empty name case + err.index = 0 + return # index in string that was scanned i = 0 # offset between input and scanned offset = 0 for tok in tokens: - if i >= err.index: + if i > err.index: # everything before the error is aligned break if tok.type in (TY_IGNORED, TY_DISALLOWED): @@ -1127,6 +1177,7 @@ def offset_err_start(err: Optional[CurableSequence], tokens: List[Token]): # input: cps, scanned: cps i += len(tok.cps) err.index += offset + err.sequence = restore_ignored_in_sequence(err.sequence, input[err.index :]) def ens_normalize(text: str) -> str: diff --git a/tests/test_normalization.py b/tests/test_normalization.py index 93950a2..19b0c09 100644 --- a/tests/test_normalization.py +++ b/tests/test_normalization.py @@ -532,3 +532,44 @@ def test_simple_name_optimization(): assert len(r.cures) == 0 assert r.error is None assert r.normalizations is None + + +@pytest.mark.parametrize( + 'input_str, expected_code, expected_index, expected_sequence, expected_suggested', + [ + ('nick.\ufe0f\ufe0f.eth', 'EMPTY_LABEL', 4, '.\ufe0f\ufe0f.', '.'), + ('01\ufe0f--345', 'HYPHEN', 3, '--', ''), + ('01-\ufe0f-345', 'HYPHEN', 2, '-\ufe0f-', ''), + ("\ufe0f'b", 'FENCED_LEADING', 1, "'", ''), + ], +) +def test_suggestions_with_ignored(input_str, expected_code, expected_index, expected_sequence, expected_suggested): + e = ens_process(input_str).error + assert e.code == expected_code + assert e.index == expected_index + assert e.sequence == expected_sequence + assert e.suggested == expected_suggested + + +@pytest.mark.parametrize( + 'input_str, expected_type, expected_index, expected_sequence, expected_suggested', + [ + # Test mapped characters with ignored characters + ('aA\ufe0fA', NormalizableSequenceType.MAPPED, 1, 'A', 'a'), # Single capital A gets mapped + ('aAB', NormalizableSequenceType.MAPPED, 1, 'A', 'a'), # First capital gets mapped + # Test FE0F normalization + ('a🚴‍♂️', NormalizableSequenceType.FE0F, 1, '🚴‍♂️', '🚴‍♂'), # FE0F in emoji + # Test 
ignored characters + ('a\u00ad', NormalizableSequenceType.IGNORED, 1, '\u00ad', ''), # Soft hyphen is ignored + # Test FE0F as ignored + ('a\ufe0f', NormalizableSequenceType.IGNORED, 1, '\ufe0f', ''), # FE0F by itself is ignored + ], +) +def test_normalizations_with_ignored(input_str, expected_type, expected_index, expected_sequence, expected_suggested): + normalizations = ens_normalizations(input_str) + assert len(normalizations) > 0 + e = normalizations[0] # Get first normalization + assert e.type == expected_type + assert e.index == expected_index + assert e.sequence == expected_sequence + assert e.suggested == expected_suggested