diff --git a/VOTable.tex b/VOTable.tex
index 16d97c5..287cef8 100644
--- a/VOTable.tex
+++ b/VOTable.tex
@@ -224,16 +224,16 @@ \subsection{XML Conventions}
data. Examples are:
\begin{verbatim}
-Fran&#231;ois
+Fran&#xE7;ois
\end{verbatim}
-In the first example, the sequence {\tt \&\#231;} is interpreted as
-part of the ISO/IEC 10646 character set (Unicode), and translates to an
-accented character, so that the text is ``Fran\c{c}ois".
+In the first example, the sequence {\tt \&\#xE7;} is interpreted as
+a numeric character reference for the Unicode code point U+00E7
+and translates to an accented character, so that the text is ``Fran\c{c}ois''.
The second example uses the special {\tt CDATA} sequence so that the
characters {\tt <}, {\tt >}, and {\tt\&} can be used without interpretation;
-in this case, any ASCII characters are allowed except the terminating
+in this case, any characters are allowed except the terminating
sequence {\tt]]>}. For more information, see any book on
XML.
@@ -362,35 +362,22 @@ \section{Data Model}
\subsection{Primitives}
\begin{table}[hbt]
-\begin{center}\begin{tabular}{|r|l|c|r|}
+\begin{center}\begin{tabular}{|r|l|c|r|l|}
\hline
- {\attr{datatype}} & Meaning & \attr{FITS} &
- { Bytes} \\
+ {\attr{datatype}} & Meaning & \attr{FITS} & { Bytes} & Notes \\
\hline
- \literalvalue{boolean} & Logical &\literalvalue{L}& 1 \\
- \literalvalue{bit} & Bit &\literalvalue{X}& * \\
- \literalvalue{unsignedByte} & Byte (0 to 255) &\literalvalue{B}& 1 \\
- \literalvalue{short} & Short Integer &\literalvalue{I}& 2 \\
- \literalvalue{int} & Integer &\literalvalue{J}& 4 \\
- \literalvalue{long} & Long integer &\literalvalue{K}& 8 \\
- \literalvalue{char} & ASCII Character &\literalvalue{A}& 1 \\
- \literalvalue{unicodeChar} & Unicode Character& & 2 \\
- \literalvalue{float} & Floating point &\literalvalue{E}& 4 \\
- \literalvalue{double} & Double &\literalvalue{D}& 8 \\
- \literalvalue{floatComplex} & Float Complex &\literalvalue{C}& 8 \\
- \literalvalue{doubleComplex}& Double Complex &\literalvalue{M}& 16 \\
- %logical & 1 \\
- %bit & * \\
- %byte & 1\\
- %short & 2 \\
- %int & 4 \\
- %long & 8 \\
- %char & 1 \\
- %unicodeChar & 2 \\
- %float & 4 \\
- %double & 8 \\
- %floatComplex & 8 \\
- %doubleComplex & 16 \\
+ \literalvalue{boolean} & Logical &\literalvalue{L}& 1 & \\
+ \literalvalue{bit} & Bit &\literalvalue{X}& * & \\
+ \literalvalue{unsignedByte} & Byte (0 to 255) &\literalvalue{B}& 1 & \\
+ \literalvalue{short} & Short Integer &\literalvalue{I}& 2 & \\
+ \literalvalue{int} & Integer &\literalvalue{J}& 4 & \\
+ \literalvalue{long} & Long integer &\literalvalue{K}& 8 & \\
+ \literalvalue{char} & UTF-8 byte &\literalvalue{A}& 1 & \\
+ \literalvalue{unicodeChar} & UTF-16 code unit & & 2 & deprecated \\
+ \literalvalue{float} & Floating point &\literalvalue{E}& 4 & \\
+ \literalvalue{double} & Double &\literalvalue{D}& 8 & \\
+ \literalvalue{floatComplex} & Float Complex &\literalvalue{C}& 8 & \\
+ \literalvalue{doubleComplex}& Double Complex &\literalvalue{M}& 16 & \\
\hline\end{tabular}\end{center}
\caption{\label{primitives}List of the Primitives
{\em(details in \Aref{sec:datatypes})}}\end{table}
@@ -409,16 +396,37 @@ \subsection{Primitives}
part of $(b+7)/8$ bytes). These primitives
are described in more detail in \Aref{sec:datatypes}.
-VOTables support two kinds of characters: ASCII 1-byte characters
-and Unicode (UCS-2) 2-byte characters. Unicode is a way to represent
-characters that is an alternative to ASCII. It uses two bytes per
-character instead of one, it is strongly supported by XML tools, and
-it can handle a large variety of international alphabets. Therefore
-VOTable supports not only ASCII strings ({\attrval{datatype}{char}}),
-but also Unicode ({\attrval{datatype}{unicodeChar}}).
+Character and string data should be encoded using the \literalvalue{char}
+type, which from VOTable 1.6 supports Unicode.
+Note that the primitive size of one byte refers to a single
+UTF-8-encoded byte, not to a single character.
+Since UTF-8 is a variable-width encoding,
+a character may require multiple bytes, and for arrays the
+string length (length in characters) and primitive count (length in bytes)
+will in general differ.
+7-bit ASCII characters are however all encoded as a single byte in UTF-8,
+so in the case of ASCII characters, which were required for this
+datatype in earlier VOTable versions, the primitive and character counts
+are equal.
+This means that a single (non-array) \literalvalue{char}
+is capable of storing a 7-bit ASCII character only.
+Strings must not be truncated mid-character,
+so truncation of a string to fit a fixed-length char array may result in
+unused bytes at the end of the array.
+
+For historical reasons the \literalvalue{unicodeChar} type can also be used
+for character storage, but from VOTable 1.6 this type is deprecated.
+For this type the primitive size of two bytes corresponds to a 2-byte
+UTF-16 {\em code unit}.
+Only characters in the Unicode Basic Multilingual Plane,
+which all have 2-byte representations, are permitted for this datatype,
+so that the primitive count matches the character count.
+This restricted encoding is identical to the obsolete UCS-2 encoding,
+which was the description used in earlier VOTable versions.
Note that strings are not a primitive type: strings are
-represented in VOTable as an array of characters. %in an characters are.
+represented in VOTable as an array of character storage units
+(usually UTF-8 bytes).
\subsection{Columns as Arrays}\label{array}
@@ -456,17 +464,28 @@ \subsection{Columns as Arrays}\label{array}
\elemdef{FIELD}{ \attrval{ID}{thumbs} \attrval{datatype}{unsignedByte}
\attrval{arraysize}{64x64x10*}\slash}
-Strings, which are defined as a set of characters,
+Strings, which are defined as a sequence of characters,
can therefore be represented in VOTable as a fixed- or variable-length
-array of characters:
+array of character elements:
\elemdef{FIELD}{ \attrval{name}{unboundedString} \attrval{datatype}{char}
\attrval{arraysize}{*}\slash}
+Note that the \attr{arraysize} for a \attrval{datatype}{char}
+array corresponds to the storage length,
+that is the number of UTF-8 bytes required to store string values,
+and not necessarily the number of characters in the string.
+So a \elem{FIELD} with \attrval{datatype}{char} and \attrval{arraysize}{4}
+could store the value \literalvalue{LCDM}
+but not \literalvalue{$\Lambda$CDM},
+since the character $\Lambda$ (Lambda)
+is encoded in two bytes (0xCE, 0x9B) by UTF-8
+while the ASCII characters L, C, D, M are each encoded in one byte.
+
A 1D array of strings can be represented as a 2D array of characters, but
given the logic above, it is possible to define a variable-length array
-of fixed-length strings,
-but not a fixed-length array of variable-length strings.
+of fixed-storage-length strings,
+but not a fixed-length array of variable-storage-length strings.
A convention to express an array of variable-length strings
exists (see \Aref{sec:arraystring}) but is not
part of this standard.
@@ -1520,7 +1539,7 @@ \subsection{\elem{TABLEDATA} Serialization}
If a cell contains an array of numbers or a complex number,
it should be encoded as multiple numbers separated by
-whitespace. However in the case of character and Unicode strings
+whitespace. However in the case of character strings
(declared in the corresponding \elem{FIELD} as an array of {\em char}
or {\em unicodeChar} datatype), no
separator should exist. Here is an example of a two-row table
@@ -1542,8 +1561,8 @@ \subsection{\elem{TABLEDATA} Serialization}
\end{verbatim}
\endgroup
-The first entry is a fixed-length array of 10 characters; since
-the value being presented ({\tt Apple}) has 5 characters, this
+The first entry is a fixed-length array of 10 UTF-8 bytes; since
+the value being presented ({\tt Apple}) is encoded in 5 bytes, this
is padded with trailing blanks. The second cell is a short integer
but has a null value, as indicated by the empty \elem{TD} element.
The third cell contains a variable-length array of integers.
@@ -1551,8 +1570,9 @@ \subsection{\elem{TABLEDATA} Serialization}
A special notice should be mentioned about the significance of
{\em white space} in a table cell (the term {\em white space}
-designates the characters {\em space} [{\tt{x20}}], {\em tab} [{\tt{x09}}],
-{\em newline} [{\tt{x0a}}], {\em carriage-return} [{\tt{x0d}}]):
+designates the characters {\em space} [{\tt{U+0020}}],
+{\em tab} [{\tt{U+0009}}],
+{\em newline} [{\tt{U+000A}}], {\em carriage-return} [{\tt{U+000D}}]):
while for numeric data types
the amount of white spaces does not matter (the elements
of an array of numbers may for instance be written on several lines),
@@ -1953,44 +1973,42 @@ \section{Definitions of Primitive Datatypes}
\item {\bf Character}\quad If the value of the {\attr{datatype}}
attribute specifies data type {\literalvalue{char}},
-the field shall contain in the \elem{BINARY}/\elem{BINARY2} serialization an ASCII
-(7-bit) character.
+the field shall contain in the \elem{BINARY}/\elem{BINARY2} serialization
+a UTF-8 encoded byte.
The \attr{arraysize} attribute
-indicates a character string composed of ASCII text.
-The \elem{BINARY}/\elem{BINARY2} serialization follows the
-FITS rules for character strings,
-and a character string may therefore be terminated by an ASCII
-NULL [0x00]
+indicates a Unicode string composed of UTF-8 encoded text.
+A string may be terminated by a NULL code point
+(U+0000, encoded as the byte 0x00)
before the length specified in the \attr{arraysize} attribute.
-In this case characters after the first ASCII NULL are not defined,
-and a string having the number of characters identical to
+In this case bytes after the first NULL are ignored,
+and a string having the number of bytes identical to
the \attr{arraysize} value is not NULL terminated.
-Characters should be represented in the \elem{TABLEDATA} serialization
-using the normal rules for encoding XML text:
-the ampersand (\&) can be written \verb+&amp;+ (symbolic representation)
-or \verb+&#38;+ (decimal representation) or
-\verb+&#x26;+ (hexadecimal representation); the less-than ({\tt<}) and greater-than ({\tt>}) symbols should be coded \verb+&lt;+ and \verb+&gt;+
-or \verb+&#60;+ and \verb+&#62;+.
+The value MUST represent a legal UTF-8 encoded string,
+and therefore MUST NOT be truncated midway through a multi-byte sequence.
+Characters are represented in the \elem{TABLEDATA} serialization
+using the XML encoding of the VOTable document, which is typically UTF-8.
Also note the significance of the {\em white space} characters
in the \elem{TABLEDATA} serialization
(\Arefs{elem:TD}).
\item {\bf Unicode Character}\quad If the value of the {\attr{datatype}}
attribute specifies data type {\literalvalue{unicodeChar}},
-the field shall contain a Unicode character.
+the field shall contain in the \elem{BINARY}/\elem{BINARY2} serialization
+the 2-byte big-endian UTF-16 encoding
+of a Unicode character from the Basic Multilingual Plane
+(equivalent to the obsolete UCS-2 encoding).
The \attr{arraysize} attribute
-indicates a string composed of Unicode text,
-which enables representation of text in many non-Latin alphabets.
-Each Unicode character is represented in the \elem{BINARY}/\elem{BINARY2} serialization by
-two bytes, using the big-endian UCS-2 encoding (ISO-10646-UCS-2).
-The representation of a Unicode character in the \elem{TABLEDATA} serialization
-follows the XML specifications,
-and e.g. the Cyrillic uppercase ``Ya'' can be written
-\verb+&#x042F;+ in UTF-8.
+indicates a string composed of Unicode BMP characters.
+Characters are represented in the \elem{TABLEDATA} serialization
+using the XML encoding of the VOTable document, which is typically UTF-8.
Also note the significance of the {\em white space} characters
in the \elem{TABLEDATA} serialization
-(\Arefs{elem:TD})
-
+(\Arefs{elem:TD}).
+Regardless of serialization, non-BMP characters are not permitted
+by this standard, but readers MAY treat such characters normally
+if encountered, for instance by using a UTF-16 decoder on \elem{BINARY} data,
+though note in this case the \attr{arraysize} may no longer match the character count.
+Note this datatype is {\bf deprecated} from VOTable 1.6.
\item {\bf 16-Bit Integer}\quad If the value of the {\attr{datatype}}
attribute specifies datatype {\literalvalue{short}},
@@ -2335,6 +2353,29 @@ \subsection{Differences Between Versions 1.4 and 1.5}
\end{itemize}
\end{itemize}
+
+\subsection{Differences Between Versions 1.5 and 1.6}
+\label{diff1.5-1.6}
+The differences between version 1.6 of VOTable and the preceding
+version 1.5 are:
+
+\begin{itemize}
+\item Unicode characters and strings are properly supported.
+ Elements of the datatype \literalvalue{char} are now defined to
+ contain UTF-8-encoded bytes (not ASCII characters)
+ and elements of the datatype \literalvalue{unicodeChar} are
+ defined to contain UTF-16 2-byte code units for BMP code points
+ (not UCS-2 characters).
+ Both types are represented using the document encoding in the
+ \elem{TABLEDATA} serialization.
+ Furthermore the \literalvalue{unicodeChar} type is deprecated.
+ These changes are entirely compatible with earlier VOTable versions
+ (any legal VOTable document of an earlier version
+ will be correctly interpreted by a VOTable 1.6 parser)
+ but enable inclusion of arbitrary Unicode content
+ using the usual UTF-8 encoding.
+\end{itemize}
+
% NOTE: IVOA recommendations must be cited from docrepo.bib
\bibliography{ivoatex/ivoabib,ivoatex/docrepo,localrefs}