From 35d744efebdd8c694d5fae8a00c866a533959d99 Mon Sep 17 00:00:00 2001 From: alesshosry Date: Thu, 26 Feb 2026 14:26:49 +0100 Subject: [PATCH] Now it is calling #positionFromTSPoint:usingEncoding: which is in turn calculating positions based on bytes. As a result, tsLines, tsLinesDo: and tsLineIndicesDo: are removed because they are no longer used. This is linked to issue #21 --- .../TSHighlighter.class.st | 2 +- src/TreeSitter/String.extension.st | 109 +++++------------- src/TreeSitter/TSLibrary.class.st | 12 +- src/TreeSitter/TSParser.class.st | 18 +-- 4 files changed, 45 insertions(+), 96 deletions(-) diff --git a/src/TreeSitter-Highlighter/TSHighlighter.class.st b/src/TreeSitter-Highlighter/TSHighlighter.class.st index 3ac18d7..794b4af 100644 --- a/src/TreeSitter-Highlighter/TSHighlighter.class.st +++ b/src/TreeSitter-Highlighter/TSHighlighter.class.st @@ -53,7 +53,7 @@ TSHighlighter >> highlight: aString [ text := aString asText. string := aString. tree := self parser parseString: aString. - self highlight: text usingNode: tree rootNode. + self highlight: text withPlatformLineEndings usingNode: tree rootNode. ^ text ] diff --git a/src/TreeSitter/String.extension.st b/src/TreeSitter/String.extension.st index 5c3ac45..ff4fea1 100644 --- a/src/TreeSitter/String.extension.st +++ b/src/TreeSitter/String.extension.st @@ -2,89 +2,38 @@ Extension { #name : 'String' } { #category : '*TreeSitter' } String >> positionFromTSPoint: aTSPoint [ - "Use me to convert a TSPoint to the position in the original string" - - "Ideally this method (and #ts* methods on String) should be removed because it does not manage the encoding. On top of that, they are too slow because they do too much things while we could directly manipulate a ByteArray or BinaryStream and use #startByte and #endByte to read or get the positions." - - | sum currentLine lines | - sum := aTSPoint column. - currentLine := 0. - lines := self tsLines. 
- [ currentLine < aTSPoint row ] whileTrue: [ - sum := sum + (lines at: currentLine + 1) size. - currentLine := currentLine + 1 ]. - ^ sum -] - -{ #category : '*TreeSitter' } -String >> tsLineIndicesDo: aBlock [ - "execute aBlock with 3 arguments for each line: - - start index of line - - end index of line without line delimiter - - end index of line including line delimiter(s) CR, LF or CRLF" - - - "Ideally this method (and #ts* methods on String) should be removed because it does not manage the encoding. On top of that, they are too slow because they do too much things while we could directly manipulate a ByteArray or BinaryStream and use #startByte and #endByte to read or get the positions." - - | cr lf start sz nextLF nextCR | - start := 1. - sz := self size. - cr := Character cr. - nextCR := self indexOf: cr startingAt: 1. - lf := Character lf. - nextLF := self indexOf: lf startingAt: 1. - sz = 0 - ifTrue: [ aBlock value: sz value: sz value: sz. - ^ self ]. - [ start <= sz ] - whileTrue: [ (nextLF = 0 and: [ nextCR = 0 ]) - ifTrue: [ "No more CR, nor LF, the string is over" - aBlock value: start value: sz value: sz. - ^ self ]. - (nextCR = 0 or: [ 0 < nextLF and: [ nextLF < nextCR ] ]) - ifTrue: [ "Found a LF" - aBlock value: start value: nextLF - 1 value: nextLF. - start := 1 + nextLF. - nextLF := self indexOf: lf startingAt: start ] - ifFalse: [ 1 + nextCR = nextLF - ifTrue: [ "Found a CR-LF pair" - aBlock value: start value: nextCR - 1 value: nextLF. - start := 1 + nextLF. - nextCR := self indexOf: cr startingAt: start. - nextLF := self indexOf: lf startingAt: start ] - ifFalse: [ "Found a CR" - aBlock value: start value: nextCR - 1 value: nextCR. - start := 1 + nextCR. - nextCR := self indexOf: cr startingAt: start ] ] ]. 
- aBlock value: start value: sz value: sz + "Convenience entry point: delegates to #positionFromTSPoint:usingEncoding: with #utf8 as the encoding" + ^ self positionFromTSPoint: aTSPoint usingEncoding: #utf8 ] { #category : '*TreeSitter' } -String >> tsLines [ - "Same as lines but empty line exist" +String >> positionFromTSPoint: aTSPoint usingEncoding: anEncoding [ + "Answer the zero-based byte offset of aTSPoint inside the receiver once it is encoded with anEncoding: rows are skipped by counting LF bytes (row 0 is the first line), then the column is added as a byte count. Signals an error when the row or the resulting offset lies beyond the encoded bytes" + "It is specifically used in the TSHighlighter to make it compatible with inspectionFASTSourceCode: of FASTEntity" - "Ideally this method (and #ts* methods on String) should be removed because it does not manage the encoding. On top of that, they are too slow because they do too much things while we could directly manipulate a ByteArray or BinaryStream and use #startByte and #endByte to read or get the positions." - - ^ Array - new: (self size // 60 max: 16) - streamContents: [ :lines | - self tsLinesDo: [ :aLine | lines nextPut: aLine ] ] -] - -{ #category : '*TreeSitter' } -String >> tsLinesDo: aBlock [ - "Same as linesDo but empty line exist" - - - "Ideally this method (and #ts* methods on String) should be removed because it does not manage the encoding. On top of that, they are too slow because they do too much things while we could directly manipulate a ByteArray or BinaryStream and use #startByte and #endByte to read or get the positions." - - self - tsLineIndicesDo: [ :start :endWithoutDelimiters :end | - | begin | - "endWithoutDelimiters = start - ifTrue: [ aBlock value: '' ] - ifFalse: [" - begin := (start = 0) ifTrue: [ 1 ] ifFalse: [ start ]. - aBlock value: (self copyFrom: begin to: end) "]" ] + | bytes currentRow index | + + bytes := self encodeWith: anEncoding. "encode the receiver into bytes using anEncoding" + + currentRow := 0. + index := 1. + + [ currentRow < aTSPoint row ] whileTrue: [ + index > bytes size ifTrue: [ + self error: 'Row exceeds number of lines' + ]. 
+ + (bytes at: index) = 10 ifTrue: [ "10 is the byte value of LF in ASCII and UTF-8, and only LF is counted as a row break; this scan is not reliable for encodings where LF is not the single byte 10 (e.g. UTF-16)" + currentRow := currentRow + 1 + ]. + + index := index + 1. + ]. + + (index - 1 + aTSPoint column) > bytes size ifTrue: [ "only guards against running past the end of the bytes; it does not check that the column stays within its own row" + self error: 'Column exceeds line length' + ]. + + ^ index - 1 + aTSPoint column ] diff --git a/src/TreeSitter/TSLibrary.class.st b/src/TreeSitter/TSLibrary.class.st index 68e638b..29cef42 100644 --- a/src/TreeSitter/TSLibrary.class.st +++ b/src/TreeSitter/TSLibrary.class.st @@ -208,17 +208,17 @@ TSLibrary >> ts_parser: aParser _parse_string: aString ofLength: length usingOld ] { #category : 'parser' } -TSLibrary >> ts_parser: aTSParser _print_dot_graphs: fd [ - +TSLibrary >> ts_parser: aParser _parse_string: aString ofLength: length usingOldTree: anOldTree encoding: anEncoding [ + ^ self ffiCall: - 'void ts_parser_print_dot_graphs (TSParser * aTSParser, int fd )' + 'TSTree *ts_parser_parse_string_encoding(TSParser * aParser, const TSTree * anOldTree, const char * aString, uint32 length, TSInputEncoding anEncoding)' ] { #category : 'parser' } -TSLibrary >> ts_parser: aParser _parse_string: aString ofLength: length usingOldTree: anOldTree encoding: anEncoding [ - +TSLibrary >> ts_parser: aTSParser _print_dot_graphs: fd [ + ^ self ffiCall: - 'TSTree *ts_parser_parse_string_encoding(TSParser * aParser, const TSTree * anOldTree, const char * aString, uint32 length, TSInputEncoding anEncoding)' + 'void ts_parser_print_dot_graphs (TSParser * aTSParser, int fd )' ] { #category : 'parser' } diff --git a/src/TreeSitter/TSParser.class.st b/src/TreeSitter/TSParser.class.st index ddc1497..9cd768c 100644 --- a/src/TreeSitter/TSParser.class.st +++ b/src/TreeSitter/TSParser.class.st @@ -126,15 +126,6 @@ TSParser >> parseString: aString usingTree: aTree pharoEncoding: anEncoding [ usingOldTree: aTree ] -{ #category : 'parsing' } -TSParser >> printDotGraphTo: aFileDescriptor [ - "-1 for no" - - ^ TSLibrary 
uniqueInstance - ts_parser: self - _print_dot_graphs: aFileDescriptor -] - { #category : 'parsing' } TSParser >> parseString: aString usingTree: aTree tsEncoding: anEncoding [ @@ -150,6 +141,15 @@ TSParser >> parseString: aString usingTree: aTree tsEncoding: anEncoding [ encoding: anEncoding ] +{ #category : 'parsing' } +TSParser >> printDotGraphTo: aFileDescriptor [ + "Forward aFileDescriptor to the TSLibrary binding ts_parser:_print_dot_graphs: for this parser. The original note reads -1 for no; presumably passing -1 switches the DOT graph output off (TODO confirm against the tree-sitter API)" + + ^ TSLibrary uniqueInstance + ts_parser: self + _print_dot_graphs: aFileDescriptor +] + { #category : 'initialization' } TSParser >> reset [