@@ -32,17 +32,27 @@ class CompletionContext
3232 protected $ charIndex = 0 ;
3333
3434 /**
35- * An array containing the individual words in the current command line.
35+ * An array of the individual words in the current command line.
3636 *
3737 * This is not set until $this->splitCommand() is called, when it is populated by
3838 * $commandLine exploded by $wordBreaks
3939 *
4040 * Bash equivalent: COMP_WORDS
4141 *
42- * @var array |null
42+ * @var string[] |null
4343 */
4444 protected $ words = null ;
4545
46+ /**
47+ * Words from the current command line before quotes and escaping are processed
48+ *
49+ * This is indexed the same as $this->words, but the terms are kept in their raw input form, including
50+ * quotes and escaping.
51+ *
52+ * @var string[]|null
53+ */
54+ protected $ rawWords = null ;
55+
4656 /**
4757 * The index in $this->words containing the word at the current cursor position.
4858 *
@@ -61,7 +71,7 @@ class CompletionContext
6171 *
6272 * @var string
6373 */
64- protected $ wordBreaks = "' \" () = \t\n" ;
74+ protected $ wordBreaks = "= \t\n" ;
6575
6676 /**
6777 * Set the whole contents of the command line as a string
@@ -101,6 +111,22 @@ public function getCurrentWord()
101111 return '' ;
102112 }
103113
/**
 * Get the word at the cursor exactly as it appears in the input
 *
 * Unlike the processed word list, quote characters and escaping from the input command line are left intact.
 *
 * @return string the raw word at the current word index, or an empty string if there is none
 */
public function getRawCurrentWord()
{
    // Guard: no raw word has been recorded for the current word index
    if (!isset($this->rawWords[$this->wordIndex])) {
        return '';
    }

    return $this->rawWords[$this->wordIndex];
}
129+
104130 /**
105131 * Return a word by index from the command line
106132 *
@@ -132,6 +158,22 @@ public function getWords()
132158 return $ this ->words ;
133159 }
134160
/**
 * Return the command line's words in their literal, unprocessed form
 *
 * The list uses the same indexes as getWords(), but each entry keeps the quoting and escaping
 * that was present on the command line.
 *
 * @return string[]
 */
public function getRawWords()
{
    // The raw word list is only built when the command line is split, so split on first access
    if (is_null($this->rawWords)) {
        $this->splitCommand();
    }

    return $this->rawWords;
}
176+
135177 /**
136178 * Get the index of the word the cursor is currently in
137179 *
@@ -178,12 +220,15 @@ public function setCharIndex($index)
178220 * This defaults to a sane value based on BASH's word break characters and shouldn't
179221 * need to be changed unless your completions contain the default word break characters.
180222 *
223+ * @deprecated This is becoming an internal setting that doesn't make sense to expose publicly.
224+ *
181225 * @see wordBreaks
182226 * @param string $charList - a single string containing all of the characters to break words on
183227 */
public function setWordBreaks($charList)
{
    // Drop quotes from break characters - strings are handled separately to word breaks now
    $this->wordBreaks = str_replace(array('"', '\''), '', $charList);

    // Discard the computed words so they are split again using the new break characters
    $this->reset();
}
189234
@@ -194,57 +239,146 @@ public function setWordBreaks($charList)
194239 */
protected function splitCommand()
{
    // Always start from empty lists. count() below must never see an unset (null) list, which
    // raises an error on current PHP versions, and appending to lists left over from an earlier
    // split would duplicate words.
    $this->words = array();
    $this->rawWords = array();
    $this->wordIndex = null;

    $tokens = $this->tokenizeString($this->commandLine);

    foreach ($tokens as $token) {
        $isBreak = ($token['type'] === 'break');

        // Every non-break token is a word: keep both its processed and its raw form
        if (!$isBreak) {
            $this->words[] = $this->getTokenValue($token);
            $this->rawWords[] = $token['value'];
        }

        // Determine which word index the cursor is inside once we reach its offset
        if ($this->wordIndex === null && $this->charIndex <= $token['offsetEnd']) {
            // Index of the most recently added word (-1 if no word has been added yet)
            $this->wordIndex = count($this->words) - 1;

            if ($isBreak) {
                // Cursor is in the break-space after a word
                // Push an empty word at the cursor to allow completion of new terms at the cursor,
                // ignoring words ahead
                $this->wordIndex++;
                $this->words[] = '';
                $this->rawWords[] = '';
                continue;
            }

            if ($this->charIndex < $token['offsetEnd']) {
                // Cursor is inside the current word - truncate the word at the cursor to complete on
                // This emulates BASH completion's behaviour with COMP_CWORD

                // Create a copy of the token with its value truncated
                $truncatedToken = $token;
                $relativeOffset = $this->charIndex - $token['offset'];
                $truncatedToken['value'] = substr($token['value'], 0, $relativeOffset);

                // Replace the current word with the truncated value
                $this->words[$this->wordIndex] = $this->getTokenValue($truncatedToken);
                $this->rawWords[$this->wordIndex] = $truncatedToken['value'];
            }
        }
    }

    // Cursor position is past the end of the command line string - consider it a new word
    if ($this->wordIndex === null) {
        $this->wordIndex = count($this->words);
        $this->words[] = '';
        $this->rawWords[] = '';
    }
}
247286
/**
 * Return a token's value with escaping and quotes removed
 *
 * For tokens of type "quoted" the opening quote is removed, along with the closing quote when it
 * is present and is the same character as the opening quote. Backslash escapes are then collapsed
 * to the character they escape.
 *
 * @see self::tokenizeString()
 * @param array $token - a token array with at least "type" and "value" keys
 * @return string
 */
protected function getTokenValue($token)
{
    $value = $token['value'];

    // Remove outer quote characters (or first quote if unclosed)
    if ($token['type'] === 'quoted') {
        // The "s" modifier lets the value span multiple lines. The back-reference makes sure only a
        // closing quote of the same kind as the opening quote is stripped, so an unclosed string
        // that happens to end in the other quote character keeps that character.
        $value = preg_replace('/^([\'"])(.*?)\1?$/s', '$2', $value);
    }

    // Remove escape characters
    $value = preg_replace('/\\\\(.)/', '$1', $value);

    return $value;
}
308+
/**
 * Break a string into words, quoted strings and non-words (breaks)
 *
 * Returns an array of unmodified segments of $string with offset and type information.
 * "offset" is where the segment starts in $string and "offsetEnd" is the offset just past
 * its last character (offset + length of the segment).
 *
 * @param string $string
 * @return array as [ [type => string, value => string, offset => int, offsetEnd => int], ... ]
 */
protected function tokenizeString($string)
{
    // Map capture groups to returned token type
    $typeMap = array(
        'double_quote_string' => 'quoted',
        'single_quote_string' => 'quoted',
        'word' => 'word',
        'break' => 'break',
    );

    // Backslash-escape every non-alphanumeric word break character (including whitespace) so it is
    // taken literally inside the character classes below. Letters and digits are left untouched:
    // a backslash in front of them would form a regex escape sequence (such as \d or \s) rather
    // than match the character itself.
    $breaks = preg_replace('/[^a-zA-Z0-9]/', '\\\\$0', $this->wordBreaks);

    // The pattern is written with the "x" flag, so whitespace outside of character classes is ignored.
    // The closing heredoc marker is kept at the start of its line for older PHP versions.
    $pattern = <<<"REGEX"
/(?:
    (?P<double_quote_string>
        "(\\\\.|[^\"\\\\])*(?:"|$)
    ) |
    (?P<single_quote_string>
        '(\\\\.|[^'\\\\])*(?:'|$)
    ) |
    (?P<word>
        (?:\\\\.|[^$breaks])+
    ) |
    (?P<break>
        [$breaks]+
    )
)/x
REGEX;

    $tokens = array();

    // Nothing matched (or the match failed): there are no tokens to report
    if (!preg_match_all($pattern, $string, $matches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER)) {
        return $tokens;
    }

    foreach ($matches as $set) {
        foreach ($set as $groupName => $match) {
            // Ignore integer indices preg_match outputs (duplicates of named groups)
            if (is_int($groupName)) {
                continue;
            }

            // Skip if the offset indicates this group didn't match
            if ($match[1] === -1) {
                continue;
            }

            $tokens[] = array(
                'type' => $typeMap[$groupName],
                'value' => $match[0],
                'offset' => $match[1],
                'offsetEnd' => $match[1] + strlen($match[0]),
            );

            // Move to the next set (only one group should match per set)
            break;
        }
    }

    return $tokens;
}
381+
248382 /**
249383 * Reset the computed words so that $this->splitWords is forced to run again
250384 */
0 commit comments