Skip to content

Commit 86c22fd

Browse files
eggrobin and markusicu authored
Propertiness (#1064)
* Disallow and remove duplicates in ExtraPropertyAliases.txt * Add derived status * group by derived status in character.jsp * Try to generate code that compiles * Java floundering * Better formatting * Spotless and logic * That one is enumerated * Remove more redundant ICU-based properties * Actually add the source file * More ICU cleanup * Misclassified * The Emoji properties are properties * Factor conditionals Co-authored-by: Markus Scherer <[email protected]> --------- Co-authored-by: Markus Scherer <[email protected]>
1 parent 7628438 commit 86c22fd

File tree

7 files changed

+1133
-529
lines changed

7 files changed

+1133
-529
lines changed

Diff for: UnicodeJsps/src/main/java/org/unicode/jsp/Common.java

+2-27
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,7 @@
77
import com.ibm.icu.text.Normalizer2;
88
import com.ibm.icu.text.StringTransform;
99
import com.ibm.icu.text.Transform;
10-
import com.ibm.icu.text.UTF16;
1110
import com.ibm.icu.text.UnicodeSet;
12-
import com.ibm.icu.util.ULocale;
1311
import java.util.Arrays;
1412
import java.util.List;
1513
import org.unicode.jsp.XPropertyFactory.HanType.HanTypeValues;
@@ -47,18 +45,7 @@ public String transform(String source) {
4745
};
4846

4947
static List<String> XPROPERTY_NAMES =
50-
Arrays.asList(
51-
new String[] {
52-
"toNFC",
53-
"toNFD",
54-
"toNFKC",
55-
"toNFKD",
56-
"toCasefold",
57-
"toLowercase",
58-
"toUppercase",
59-
"toTitlecase",
60-
"subhead"
61-
});
48+
Arrays.asList(new String[] {"toNFC", "toNFD", "toNFKC", "toNFKD", "subhead"});
6249
static final int XSTRING_START = UProperty.STRING_LIMIT;
6350

6451
public static String getXStringPropertyValue(int propertyEnum, int codepoint, int nameChoice) {
@@ -72,14 +59,6 @@ public static String getXStringPropertyValue(int propertyEnum, int codepoint, in
7259
return Common.MyNormalize(codepoint, Normalizer.NFKC);
7360
case Common.TO_NFKD:
7461
return Common.MyNormalize(codepoint, Normalizer.NFKD);
75-
case Common.TO_CASEFOLD:
76-
return UCharacter.foldCase(UTF16.valueOf(codepoint), true);
77-
case Common.TO_LOWERCASE:
78-
return UCharacter.toLowerCase(ULocale.ROOT, UTF16.valueOf(codepoint));
79-
case Common.TO_UPPERCASE:
80-
return UCharacter.toUpperCase(ULocale.ROOT, UTF16.valueOf(codepoint));
81-
case Common.TO_TITLECASE:
82-
return UCharacter.toTitleCase(ULocale.ROOT, UTF16.valueOf(codepoint), null);
8362
case Common.SUBHEAD:
8463
return UnicodeUtilities.getSubheader().getSubheader(codepoint);
8564
}
@@ -120,11 +99,7 @@ static String MyNormalize(String string, Mode mode) {
12099
static final int TO_NFD = UProperty.STRING_LIMIT + 1;
121100
static final int TO_NFKC = UProperty.STRING_LIMIT + 2;
122101
static final int TO_NFKD = UProperty.STRING_LIMIT + 3;
123-
static final int TO_CASEFOLD = UProperty.STRING_LIMIT + 4;
124-
static final int TO_LOWERCASE = UProperty.STRING_LIMIT + 5;
125-
static final int TO_UPPERCASE = UProperty.STRING_LIMIT + 6;
126-
static final int TO_TITLECASE = UProperty.STRING_LIMIT + 7;
127-
public static final int SUBHEAD = TO_TITLECASE + 1;
102+
public static final int SUBHEAD = TO_NFKD + 1;
128103
static final int XSTRING_LIMIT = SUBHEAD + 1;
129104
// static UnicodeSet isCaseFolded = new UnicodeSet();
130105
// static UnicodeSet isLowercase = new UnicodeSet();

Diff for: UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeUtilities.java

+112-55
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@
5151
import org.unicode.idna.IdnaTypes;
5252
import org.unicode.idna.Punycode;
5353
import org.unicode.idna.Uts46;
54+
import org.unicode.props.DerivedPropertyStatus;
55+
import org.unicode.props.UcdProperty;
5456
import org.unicode.props.UcdPropertyValues.Age_Values;
5557
import org.unicode.props.UnicodeProperty;
5658
import org.unicode.props.UnicodeProperty.UnicodeMapProperty;
@@ -1440,6 +1442,42 @@ public static void showProperties(
14401442

14411443
String kRSUnicode = getFactory().getProperty("kRSUnicode").getValue(cp);
14421444
boolean isUnihan = kRSUnicode != null;
1445+
List<UcdProperty> indexedProperties =
1446+
sortedProps.stream()
1447+
.map(UcdProperty::forString)
1448+
.filter(p -> p != null)
1449+
.collect(Collectors.toList());
1450+
List<UcdProperty> ucdProperties =
1451+
indexedProperties.stream()
1452+
.filter(
1453+
p ->
1454+
p.getDerivedStatus() == DerivedPropertyStatus.Approved
1455+
|| p.getDerivedStatus()
1456+
== DerivedPropertyStatus.Provisional)
1457+
.collect(Collectors.toList());
1458+
List<UcdProperty> nonUCDProperties =
1459+
indexedProperties.stream()
1460+
.filter(p -> p.getDerivedStatus() == DerivedPropertyStatus.NonUCDProperty)
1461+
.collect(Collectors.toList());
1462+
List<UcdProperty> ucdNonProperties =
1463+
indexedProperties.stream()
1464+
.filter(p -> p.getDerivedStatus() == DerivedPropertyStatus.UCDNonProperty)
1465+
.collect(Collectors.toList());
1466+
// Non-UCD non-properties, and things added directly in the tools.
1467+
List<String> otherData =
1468+
sortedProps.stream()
1469+
.filter(
1470+
p ->
1471+
UcdProperty.forString(p) == null
1472+
|| UcdProperty.forString(p).getDerivedStatus()
1473+
== DerivedPropertyStatus.NonUCDNonProperty)
1474+
.collect(Collectors.toList());
1475+
1476+
List<UcdProperty> cjkProperties =
1477+
ucdProperties.stream()
1478+
.filter(p -> p.getNames().getShortName().startsWith("cjk"))
1479+
.collect(Collectors.toList());
1480+
ucdProperties.removeIf(p -> p.getNames().getShortName().startsWith("cjk"));
14431481

14441482
Age_Values age = Age_Values.forName(getFactory().getProperty("Age").getValue(cp));
14451483
VersionInfo minVersion =
@@ -1456,71 +1494,86 @@ public static void showProperties(
14561494
+ "</p>");
14571495
}
14581496

1459-
out.append(
1460-
"<table class='propTable'>"
1461-
+ "<caption>"
1462-
+ (isUnihan ? "non-Unihan properties for U+" : "Properties for U+")
1463-
+ hex
1464-
+ "</caption>"
1465-
+ "<tr><th>With Non-Default Values</th><th>With Default Values</th></tr>"
1466-
+ "<tr><td width='50%'>\n");
1467-
out.append("<table width='100%'>\n");
1468-
1469-
List<String> unihanProperties = new ArrayList<>();
14701497
VersionInfo maxVersion =
14711498
showDevProperties ? Settings.LATEST_VERSION_INFO : Settings.LAST_VERSION_INFO;
1472-
for (String propName : sortedProps) {
1473-
UnicodeProperty prop = getFactory().getProperty(propName);
1499+
out.append("<table class='propTable'>");
1500+
showProperties(
1501+
ucdProperties.stream().map(UcdProperty::toString).collect(Collectors.toList()),
1502+
(isUnihan ? "Non-Unihan " : "")
1503+
+ "Normative, Informative, Contributory, and (Provisional) UCD properties for U+"
1504+
+ hex,
1505+
cp,
1506+
minVersion,
1507+
maxVersion,
1508+
showDevProperties,
1509+
out);
1510+
showProperties(
1511+
nonUCDProperties.stream().map(UcdProperty::toString).collect(Collectors.toList()),
1512+
"Non-UCD properties for U+" + hex,
1513+
cp,
1514+
minVersion,
1515+
maxVersion,
1516+
showDevProperties,
1517+
out);
1518+
showProperties(
1519+
ucdNonProperties.stream().map(UcdProperty::toString).collect(Collectors.toList()),
1520+
"Other " + (isUnihan ? "non-Unihan " : "") + "UCD data for U+" + hex,
1521+
cp,
1522+
minVersion,
1523+
maxVersion,
1524+
showDevProperties,
1525+
out);
1526+
if (isUnihan) {
1527+
showProperties(
1528+
cjkProperties.stream().map(UcdProperty::toString).collect(Collectors.toList()),
1529+
"Unihan Normative, Informative, and (Provisional) properties for U+" + hex,
1530+
cp,
1531+
minVersion,
1532+
maxVersion,
1533+
showDevProperties,
1534+
out);
1535+
}
1536+
showProperties(
1537+
otherData,
1538+
"Other information on U+" + hex,
1539+
cp,
1540+
minVersion,
1541+
maxVersion,
1542+
showDevProperties,
1543+
out);
1544+
out.append("</table>\n");
1545+
}
1546+
1547+
private static void showProperties(
1548+
List<String> properties,
1549+
String title,
1550+
int cp,
1551+
VersionInfo minVersion,
1552+
VersionInfo maxVersion,
1553+
boolean showDevProperties,
1554+
Appendable out)
1555+
throws IOException {
1556+
out.append("<tr><th colspan=2>" + title + "</th></tr>" + "<tr><td width='50%'>\n");
1557+
out.append("<table width='100%'>\n");
1558+
for (int i = 0; i < properties.size() / 2; ++i) {
1559+
UnicodeProperty prop = getFactory().getProperty(properties.get(i));
14741560
if (prop.getName().equals("confusable")) continue;
1475-
if (prop.getFirstNameAlias().startsWith("cjk")) {
1476-
unihanProperties.add(propName);
1477-
continue;
1478-
}
14791561

1480-
boolean isDefault = prop.isDefault(cp);
1481-
if (isDefault) continue;
1482-
showPropertyValue(propName, cp, minVersion, maxVersion, isDefault, out);
1562+
showPropertyValue(properties.get(i), cp, minVersion, maxVersion, out);
14831563
}
14841564
out.append("</table>\n");
14851565

14861566
out.append("</td><td width='50%'>\n");
14871567

14881568
out.append("<table width='100%'>\n");
1489-
for (String propName : sortedProps) {
1490-
UnicodeProperty prop = getFactory().getProperty(propName);
1569+
for (int i = properties.size() / 2; i < properties.size(); ++i) {
1570+
UnicodeProperty prop = getFactory().getProperty(properties.get(i));
14911571
if (prop.getName().equals("confusable")) continue;
1492-
if (prop.getFirstNameAlias().startsWith("cjk")) {
1493-
continue;
1494-
}
14951572

1496-
boolean isDefault = prop.isDefault(cp);
1497-
if (!isDefault) continue;
1498-
showPropertyValue(propName, cp, minVersion, maxVersion, isDefault, out);
1573+
showPropertyValue(properties.get(i), cp, minVersion, maxVersion, out);
14991574
}
15001575
out.append("</table>\n");
1501-
1502-
out.append("</td></tr></table>\n");
1503-
if (isUnihan) {
1504-
out.append(
1505-
"<table class='propTable'>"
1506-
+ "<caption>"
1507-
+ "Unihan properties for U+"
1508-
+ hex
1509-
+ "</caption>"
1510-
+ "<tr><td width='50%'>\n");
1511-
out.append("<table width='100%'>\n");
1512-
for (int i = 0; i < unihanProperties.size() / 2; ++i) {
1513-
showPropertyValue(unihanProperties.get(i), cp, minVersion, maxVersion, false, out);
1514-
}
1515-
out.append("</table>\n");
1516-
out.append("</td><td width='50%'>\n");
1517-
out.append("<table width='100%'>\n");
1518-
for (int i = unihanProperties.size() / 2; i < unihanProperties.size(); ++i) {
1519-
showPropertyValue(unihanProperties.get(i), cp, minVersion, maxVersion, false, out);
1520-
}
1521-
out.append("</table>\n");
1522-
out.append("</td></tr></table>\n");
1523-
}
1576+
out.append("</td></tr>\n");
15241577
}
15251578

15261579
private static StringBuilder displayConfusables(int codepoint) {
@@ -1648,10 +1701,14 @@ private static void showPropertyValue(
16481701
int codePoint,
16491702
VersionInfo minVersion,
16501703
VersionInfo maxVersion,
1651-
boolean isDefault,
16521704
Appendable out)
16531705
throws IOException {
1654-
String defaultClass = isDefault ? " class='default'" : "";
1706+
String defaultClass =
1707+
getFactory().getProperty(propName).isDefault(codePoint) ? " class='default'" : "";
1708+
var indexedProperty = UcdProperty.forString(propName);
1709+
final boolean provisional =
1710+
indexedProperty != null
1711+
&& indexedProperty.getDerivedStatus() == DerivedPropertyStatus.Provisional;
16551712
class PropertyAssignment {
16561713
VersionInfo first;
16571714
VersionInfo last;
@@ -1709,12 +1766,12 @@ class PropertyAssignment {
17091766
history.add(current);
17101767
}
17111768
out.append(
1712-
"<tr><th><a target='c' href='properties.jsp?a="
1769+
"<tr><th width='50%'><a target='c' href='properties.jsp?a="
17131770
+ propName
17141771
+ "#"
17151772
+ propName
17161773
+ "'>"
1717-
+ propName
1774+
+ (provisional ? "(" + propName + ")" : propName)
17181775
+ "</a></th>");
17191776
for (PropertyAssignment assignment : history) {
17201777
String first =
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
package org.unicode.props;
2+
3+
/**
4+
* A property status that can be derived from the data files defining the properties. Contrary to
5+
* PropertyStatus.java, this does not reflect distinctions such as Normative vs. Informative vs.
6+
* Contributory vs. Deprecated etc., as all of those are equal in the eyes of PropertyAliases.txt.
7+
* It does, however, distinguish Provisional properties from Approved ones.
8+
*/
9+
public enum DerivedPropertyStatus {
10+
/**
11+
* Properties that are part of the UCD and subject to UTC decisions. These are the ones in
12+
* PropertyAliases.txt. Their actual status may be Normative, Informative, or Contributory.
13+
*/
14+
Approved,
15+
/**
16+
* Provisional properties. These are actual UCD properties, but not in PropertyAliases.txt, and
17+
* changes to them need not be approved by the UTC. They may be removed entirely from the UCD
18+
* (though they remain in the tools, as the tools have history).
19+
*/
20+
Provisional,
21+
/**
22+
* Data in UCD files that do not specify character properties. Some of this data is exposed in
23+
* the form of properties in the tools, because all we have is a hammer.
24+
*/
25+
UCDNonProperty,
26+
/**
27+
* Properties defined outside the UCD, e.g., in UTS #39 or UTS #51. These are explicitly
28+
* described as properties in these documents.
29+
*/
30+
NonUCDProperty,
31+
/** Non-property data defined outside the UCD. */
32+
NonUCDNonProperty,
33+
}

0 commit comments

Comments
 (0)