@@ -105,225 +105,101 @@ int swap_int32(int x) {
105105 */ 
106106char  * utf8_unicode_inplace_ex (apr_pool_t  * mp , unsigned char   * input , long int   input_len , int  * changed ) {
107107    int  unicode_len  =  0 , length  =  0 ;
108-     unsigned int   d  =  0 ,  count   =   0 ;
108+     unsigned int   d  =  0 ;
109109    unsigned char   c , * utf ;
110110    char  * rval , * data ;
111111    unsigned int   i , len , j ;
112112    unsigned int   bytes_left  =  input_len ;
113113    unsigned char   * unicode  =  NULL ;
114114
115+     if  (input  ==  NULL ) return  NULL ;
116+ 
115117    * changed  =  0 ;
116118    /* RFC3629 states that UTF-8 are encoded using sequences of 1 to 4 octets. */ 
117-     /* Max size per character should fit in 4 bytes */ 
118-     len  =  input_len  *  4  +  1 ;
119+     /* Max size per character should fit in 4 bytes (%u01020304)  */ 
120+     len  =  input_len  *  10  +  1 ;
119121    data  =  rval  =  apr_palloc (mp , len );
120122    if  (rval  ==  NULL ) return  NULL ;
121123
122- 
123-     if  (input  ==  NULL ) return  NULL ;
124- 
125-     for (i  =  0 ; i  <  bytes_left ;)  {
124+     for  (i  =  0 ; i  <  bytes_left ;) {
126125        unicode_len  =  0 ; d  =  0 ;
127126        utf  =  (unsigned char   * )& input [i ];
128- 
129127        c  =  * utf ;
130128
131-         /* If first byte begins with binary 0 it is  single byte encoding */ 
129+         /* If first byte begins with binary 0 it may be  single byte encoding */ 
132130        if  ((c  &  0x80 ) ==  0 ) {
133-             /* single byte unicode (7 bit ASCII equivilent) has no validation */ 
134-             count ++ ;
135-             if (count  <= len )    {
136-                 if (c  ==  0 )
137-                     * data  =  x2c (& c );
138-                 else 
139-                     * data ++  =  c ;
131+             if  (c  ==  0 ) {
132+                 unicode_len  =  2 ;
133+                 d  =  utf [1 ];
140134            }
141- 
142135        }
143136        /* If first byte begins with binary 110 it is two byte encoding*/ 
144137        else  if  ((c  &  0xE0 ) ==  0xC0 ) {
145138            /* check we have at least two bytes */ 
146139            if  (bytes_left  <  2 ) unicode_len  =  UNICODE_ERROR_CHARACTERS_MISSING ;
147140            /* check second byte starts with binary 10 */ 
148-             else  if  ((( * ( utf   +   1 ))  &  0xC0 ) !=  0x80 ) unicode_len  =  UNICODE_ERROR_INVALID_ENCODING ;
141+             else  if  ((utf [ 1 ]  &  0xC0 ) !=  0x80 ) unicode_len  =  UNICODE_ERROR_INVALID_ENCODING ;
149142            else  {
150143                unicode_len  =  2 ;
151-                 count += 6 ;
152-                 if (count  <= len ) {
153-                     /* compute character number */ 
154-                     d  =  ((c  &  0x1F ) << 6 ) | (* (utf  +  1 ) &  0x3F );
155-                     * data ++  =  '%' ;
156-                     * data ++  =  'u' ;
157-                     unicode  =  apr_psprintf (mp , "%x" , d );
158-                     length  =  strlen (unicode );
159- 
160-                     switch (length )  {
161-                         case  1 :
162-                             * data ++  =  '0' ;
163-                             * data ++  =  '0' ;
164-                             * data ++  =  '0' ;
165-                             break ;
166-                         case  2 :
167-                             * data ++  =  '0' ;
168-                             * data ++  =  '0' ;
169-                             break ;
170-                         case  3 :
171-                             * data ++  =  '0' ;
172-                             break ;
173-                         case  4 :
174-                         case  5 :
175-                             break ;
176-                     }
177- 
178-                     for (j = 0 ; j < length ; j ++ ) {
179-                         * data ++  =  unicode [j ];
180-                     }
181- 
182-                     * changed  =  1 ;
183-                 }
144+                 /* compute character number */ 
145+                 d  =  ((c  &  0x1F ) << 6 ) | (utf [1 ] &  0x3F );
184146            }
185147        }
186148        /* If first byte begins with binary 1110 it is three byte encoding */ 
187149        else  if  ((c  &  0xF0 ) ==  0xE0 ) {
188150            /* check we have at least three bytes */ 
189151            if  (bytes_left  <  3 ) unicode_len  =  UNICODE_ERROR_CHARACTERS_MISSING ;
190152            /* check second byte starts with binary 10 */ 
191-             else  if  ((( * ( utf   +   1 ))  &  0xC0 ) !=  0x80 ) unicode_len  =  UNICODE_ERROR_INVALID_ENCODING ;
153+             else  if  ((utf [ 1 ]  &  0xC0 ) !=  0x80 ) unicode_len  =  UNICODE_ERROR_INVALID_ENCODING ;
192154            /* check third byte starts with binary 10 */ 
193155            else  if  (((* (utf  +  2 )) &  0xC0 ) !=  0x80 ) unicode_len  =  UNICODE_ERROR_INVALID_ENCODING ;
194156            else  {
195157                unicode_len  =  3 ;
196-                 count += 6 ;
197-                 if (count  <= len ) {
198-                     /* compute character number */ 
199-                     d  =  ((c  &  0x0F ) << 12 ) | ((* (utf  +  1 ) &  0x3F ) << 6 ) | (* (utf  +  2 ) &  0x3F );
200-                     * data ++  =  '%' ;
201-                     * data ++  =  'u' ;
202-                     unicode  =  apr_psprintf (mp , "%x" , d );
203-                     length  =  strlen (unicode );
204- 
205-                     switch (length )  {
206-                         case  1 :
207-                             * data ++  =  '0' ;
208-                             * data ++  =  '0' ;
209-                             * data ++  =  '0' ;
210-                             break ;
211-                         case  2 :
212-                             * data ++  =  '0' ;
213-                             * data ++  =  '0' ;
214-                             break ;
215-                         case  3 :
216-                             * data ++  =  '0' ;
217-                             break ;
218-                         case  4 :
219-                         case  5 :
220-                             break ;
221-                     }
222- 
223-                     for (j = 0 ; j < length ; j ++ ) {
224-                         * data ++  =  unicode [j ];
225-                     }
226- 
227-                     * changed  =  1 ;
228- 
229-                 }
158+                 /* compute character number */ 
159+                 d  =  ((c  &  0x0F ) << 12 ) | ((utf [1 ] &  0x3F ) << 6 ) | (* (utf  +  2 ) &  0x3F );
230160            }
231161        }
232162        /* If first byte begins with binary 11110 it is four byte encoding */ 
233163        else  if  ((c  &  0xF8 ) ==  0xF0 ) {
234164            /* restrict characters to UTF-8 range (U+0000 - U+10FFFF)*/ 
235-             if  (c  >= 0xF5 ) {
236-                 * data ++  =  c ;
237-             }
165+             if  (c  >= 0xF5 ) unicode_len  =  UNICODE_ERROR_RESTRICTED_CHARACTER ;
238166            /* check we have at least four bytes */ 
239-             if  (bytes_left  <  4 ) unicode_len  =  UNICODE_ERROR_CHARACTERS_MISSING ;
167+             else   if  (bytes_left  <  4 ) unicode_len  =  UNICODE_ERROR_CHARACTERS_MISSING ;
240168            /* check second byte starts with binary 10 */ 
241-             else  if  ((( * ( utf   +   1 ))  &  0xC0 ) !=  0x80 ) unicode_len  =  UNICODE_ERROR_INVALID_ENCODING ;
169+             else  if  ((utf [ 1 ]  &  0xC0 ) !=  0x80 ) unicode_len  =  UNICODE_ERROR_INVALID_ENCODING ;
242170            /* check third byte starts with binary 10 */ 
243171            else  if  (((* (utf  +  2 )) &  0xC0 ) !=  0x80 ) unicode_len  =  UNICODE_ERROR_INVALID_ENCODING ;
244172            /* check forth byte starts with binary 10 */ 
245173            else  if  (((* (utf  +  3 )) &  0xC0 ) !=  0x80 ) unicode_len  =  UNICODE_ERROR_INVALID_ENCODING ;
246174            else  {
247175                unicode_len  =  4 ;
248-                 count += 7 ;
249-                 if (count  <= len ) {
250-                     /* compute character number */ 
251-                     d  =  ((c  &  0x07 ) << 18 ) | ((* (utf  +  1 ) &  0x3F ) << 12 ) | ((* (utf  +  2 ) &  0x3F ) << 6 ) | (* (utf  +  3 ) &  0x3F );
252-                     * data ++  =  '%' ;
253-                     * data ++  =  'u' ;
254-                     unicode  =  apr_psprintf (mp , "%x" , d );
255-                     length  =  strlen (unicode );
256- 
257-                     switch (length )  {
258-                         case  1 :
259-                             * data ++  =  '0' ;
260-                             * data ++  =  '0' ;
261-                             * data ++  =  '0' ;
262-                             break ;
263-                         case  2 :
264-                             * data ++  =  '0' ;
265-                             * data ++  =  '0' ;
266-                             break ;
267-                         case  3 :
268-                             * data ++  =  '0' ;
269-                             break ;
270-                         case  4 :
271-                         case  5 :
272-                             break ;
273-                     }
274- 
275-                     for (j = 0 ; j < length ; j ++ ) {
276-                         * data ++  =  unicode [j ];
277-                     }
278- 
279-                     * changed  =  1 ;
280- 
281-                 }
176+                 /* compute character number */ 
177+                 d  =  ((c  &  0x07 ) << 18 ) | ((utf [1 ] &  0x3F ) << 12 ) | ((* (utf  +  2 ) &  0x3F ) << 6 ) | (* (utf  +  3 ) &  0x3F );
282178            }
283179        }
284-         /* any other first byte is invalid (RFC 3629) */ 
285-         else  {
286-             count ++ ;
287-             if (count  <= len )
288-                 * data ++  =  c ;
289-         }
290- 
291180        /* invalid UTF-8 character number range (RFC 3629) */ 
292-         if  ((d  >= 0xD800 ) &&  (d  <= 0xDFFF )) {
293-             count ++ ;
294-             if (count  <= len )
295-                 * data ++  =  c ;
296-         }
297- 
181+         if  ((d  >= 0xD800 ) &&  (d  <= 0xDFFF )) unicode_len  =  UNICODE_ERROR_RESTRICTED_CHARACTER ;
298182        /* check for overlong */ 
299-         if  ((unicode_len  ==  4 ) &&  (d  <  0x010000 )) {
300-             /* four byte could be represented with less bytes */ 
301-             count ++ ;
302-             if (count  <= len )
303-                 * data ++  =  c ;
304-         }
305-         else  if  ((unicode_len  ==  3 ) &&  (d  <  0x0800 )) {
306-             /* three byte could be represented with less bytes */ 
307-             count ++ ;
308-             if (count  <= len )
309-                 * data ++  =  c ;
310-         }
311-         else  if  ((unicode_len  ==  2 ) &&  (d  <  0x80 )) {
312-             /* two byte could be represented with less bytes */ 
313-             count ++ ;
314-             if (count  <= len )
315-                 * data ++  =  c ;
316-         }
183+         if  ((unicode_len  ==  4 ) &&  (d  <  0x010000 )) unicode_len  =  UNICODE_ERROR_OVERLONG_CHARACTER ;
184+         /* three byte could be represented with less bytes */ 
185+         if  ((unicode_len  ==  3 ) &&  (d  <  0x0800 )) unicode_len  =  UNICODE_ERROR_OVERLONG_CHARACTER ;
186+         /* two byte could be represented with less bytes */ 
187+         if  ((unicode_len  ==  2 ) &&  (d  <  0x80 )) unicode_len  =  UNICODE_ERROR_OVERLONG_CHARACTER ;
317188
318-         if (unicode_len  >  0 ) {
189+         if   (unicode_len  >  0 ) {
319190            i  +=  unicode_len ;
320-         } else  {
191+             sprintf (data , "%%u%04x" , d );
192+             data  +=  6 ;
193+             * changed  =  1 ;
194+         }
195+         else  {
196+             /* any other first byte is invalid (RFC 3629), so assume it's an ASCII character */ 
197+             * data ++  =  c ;
321198            i ++ ;
322199        }
323200    }
324201
325-     * data  = '\0' ;
326- 
202+     * data  =  '\0' ;
327203    return  rval ;
328204}
329205
0 commit comments