diff --git a/scripts/parse.py b/scripts/parse.py
index 4cfedf5..b8704e2 100644
--- a/scripts/parse.py
+++ b/scripts/parse.py
@@ -7,20 +7,24 @@
RENAMED_COLS = {
"Unicode": "unicode",
- # "全码": "fullCode",
+ "全码": "fullCode",
"简码": "shortCode",
"容错码": "faultCode",
"拼音": "pinyin",
- # "字根拆解": "units",
+ "笔画数": "stroke",
+ "字根拆解": "units",
"字表来源": "source",
+ "笔画拆解": "segments",
"识别码": "flag",
}
OUTPUT_COLS = [
"unicode",
"pinyin",
+ "stroke",
"source",
"freq",
+ "unitType",
"fullCode",
"shortCode",
"faultCode",
@@ -29,6 +33,17 @@
"flag",
]
+# Short character-list labels mapped to the full title of the list each label
+# stands for. get_stats() uses this mapping to expand the labels it counts
+# (labels without an entry are kept as-is) and also returns it under "names".
+CHAR_NAMES = {
+ "一级": "《通用规范汉字表》(2012年)一级汉字",
+ "二级": "《通用规范汉字表》(2012年)二级汉字",
+ "三级": "《通用规范汉字表》(2012年)三级汉字",
+ "GB2312": "《信息交换用汉字编码字符集》(GB/T 2312-1980)",
+ "常用字": "《现代汉语常用字表》(1988年)常用字",
+ "次常用字": "《现代汉语常用字表》(1988年)次常用字",
+ "通用字": "《现代汉语通用字表》(1988年)通用字",
+ "其他": "其他常用汉字",
+}
+
def read_source(data_dir: str):
files = Path(data_dir).glob("*.tsv")
@@ -41,13 +56,50 @@ def read_source(data_dir: str):
return df
+def get_stats(df, col="字表来源"):
+    """Summarise character-list membership and coverage counts for *df*.
+
+    Each cell of ``col`` may name several lists separated by "/"; every
+    non-empty, stripped label is counted once per occurrence.
+
+    Returns a dict with:
+      - "names": the CHAR_NAMES label -> full-title mapping,
+      - "stats": records of {"字表": full title, "count": occurrences},
+      - "total": the number of rows in *df*,
+      - one entry per RENAMED_COLS name of "全码", "字根拆解" and "笔画拆解"
+        holding the number of rows whose value in that column is non-empty.
+    """
+    col_temp = "字表分级"
+    col_out = "字表"
+    sources = [
+        v.strip()
+        for line in df[col].fillna("").tolist()
+        for v in line.strip().split("/")
+        if v.strip()
+    ]
+    # Name the index and the counts column explicitly: a bare reset_index()
+    # yields ("index", <series name>) on pandas < 2.0 but
+    # (<series name>, "count") on pandas >= 2.0, so the lookups below would
+    # only work on one of the two.
+    df_stats = (
+        pd.Series(sources, name=col_temp)
+        .value_counts()
+        .rename_axis(col_temp)
+        .reset_index(name="count")
+    )
+
+    names = CHAR_NAMES
+    df_stats[col_out] = df_stats[col_temp].apply(lambda x: names.get(x, x))
+    result = df_stats[[col_out, "count"]].to_dict("records")
+    out = {"names": names, "stats": result, "total": df.shape[0]}
+
+    for key in ["全码", "字根拆解", "笔画拆解"]:
+        out[RENAMED_COLS[key]] = df[df[key].fillna("") != ""].shape[0]
+    return out
+
+
def _to_int_array(x: str) -> list:
return [[int(u) for u in v.split(",") if u] for v in x.strip("*").split("/")]
-def tsv_to_json(data_dir: str, save_dir: str) -> None:
+def _unit_type(x):
+    """Return a "字根(...)" tag listing the radical kinds flagged on row *x*.
+
+    For each of the "键名字根", "笔画字根" and "成字字根" fields that equals 1,
+    the first two characters of the field name are collected, in that order,
+    and joined with ",". Returns "" when none of the three fields equals 1.
+    """
+    labels = ",".join(
+        name[:2] for name in ("键名字根", "笔画字根", "成字字根") if x[name] == 1
+    )
+    return "字根({})".format(labels) if labels else ""
- save_dir = Path(save_dir)
+
+def _full_code(x):
+    """Return the "全码" value of row *x*, splitting off its last character.
+
+    When the "识别码" field is truthy, the last character of "全码" is
+    separated from the rest with ";" (e.g. "abcd" -> "abc;d"); otherwise
+    "全码" is returned unchanged.
+    """
+    fullcode = x["全码"]
+    # An empty full code has no last character to split off; without this
+    # guard fullcode[-1] raises IndexError for such a row.
+    if x["识别码"] and fullcode:
+        return "{};{}".format(fullcode[:-1], fullcode[-1])
+    return fullcode
+
+
+def tsv_to_json(data_dir: str, save_path: str) -> None:
+ save_dir = Path(save_path)
save_file = Path(save_dir, "data.json")
if not save_dir.exists():
logging.info(f"Create dir = {save_dir}")
@@ -59,31 +111,34 @@ def tsv_to_json(data_dir: str, save_dir: str) -> None:
return
logging.info(f"df = {df.shape}")
- for col in ["全码", "简码", "容错码", "字根拆解", "笔画拆解", "识别码"]:
+ for col in ["全码", "简码", "容错码", "字根拆解", "笔画拆解", "识别码", '笔画数']:
df[col] = df[col].fillna("").astype(str)
for col in ["现代汉语语料库字频(%)", "刑红兵25亿字语料字频(百万)"]:
df[col] = df[col].fillna(0).astype(float)
- df.index = df["汉字"]
- df = df.rename(columns=RENAMED_COLS)
- df["fullCode"] = df[["全码", "flag"]].apply(
- lambda x: "{};{}".format(x["全码"][:-1], x["全码"][-1]) if x["flag"] else x["全码"]
- , axis=1)
+ df["fullCode"] = df.apply(_full_code, axis=1)
df["units"] = df["字根拆解"].apply(lambda x: " ".join(x.split("※")))
df["segments"] = df["笔画拆解"].apply(_to_int_array)
keys = ["现代汉语语料库字频(%)", "刑红兵25亿字语料字频(百万)"]
df["freq"] = df.apply(lambda x: [x[k] for k in keys], axis=1)
- # keys = ["五笔常用前1500", "一级简码", "二级简码", "键名字根", "笔画字根", "成字字根"]
- # df["level"] = df.apply(lambda x: [x[k] for k in keys], axis=1)
+ # "五笔常用前1500", "一级简码", "二级简码",
+ df["unitType"] = df.apply(_unit_type, axis=1)
+
+ stats = get_stats(df)
+ renames = {k: v for k, v in RENAMED_COLS.items() if v not in df.columns}
+ df = df.rename(columns=renames)
+ df = df.set_index("汉字")
df2 = df[OUTPUT_COLS]
- logging.info(f"output data = {df.shape}")
+ logging.info(f"output data = {df2.shape}")
out = df2.to_dict("index")
+ result = {"stats": stats, "chars": out}
+
logging.info(f"save to = {save_file}")
with open(save_file, "w") as f:
- json.dump(out, f, indent=None, ensure_ascii=False)
+ json.dump(result, f, indent=None, ensure_ascii=False)
if __name__ == "__main__":
diff --git a/web/css/styles.css b/web/css/styles.css
index ca29c61..29f2457 100644
--- a/web/css/styles.css
+++ b/web/css/styles.css
@@ -10,6 +10,7 @@
--area-color: #272b35;
--code-color: #f5f5f5;
}
+
@font-face {
font-family: "Wubi Units";
src: url("../font/黑体字根.ttf");
@@ -76,40 +77,6 @@ hr {
height: 5px;
}
-button {
- padding: 12px 24px;
- border: none;
- border-radius: 4px;
- cursor: pointer;
- font-size: 16px;
- /* font-weight: 500; */
- text-transform: uppercase;
- transition: background-color 0.3s;
- box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
- margin: 0 5px;
-}
-
-input[type="text"] {
- color: var(--text-color);
- background-color: var(--background-color);
- border: 1px solid var(--primary-color);
- border-radius: 3px;
- padding: 8px;
- transition: border-color 0.3s;
- box-sizing: border-box;
- width: 100%;
- max-width: 300px;
- margin-right: 10px;
- font-size: 16px;
- /* font-family: "Roboto Mono", monospace; */
- border: none;
- outline: none;
- padding: 10px;
- border-radius: 25px;
- flex: 1;
- font-size: 16px;
-}
-
.search-container {
display: flex;
align-items: center;
@@ -123,14 +90,28 @@ input[type="text"] {
margin: 2rem auto;
}
+
+/* Text inputs: borderless, outline-free rounded field drawn in the theme's
+   text and background colours. NOTE(review): flex: 1 only has an effect when
+   the parent is a flex container, presumably .search-container; confirm
+   against index.html. */
+input[type="text"] {
+ color: var(--text-color);
+ background-color: var(--background-color);
+ width: 100%;
+ flex: 1;
+ outline: none;
+ border: none;
+ border-radius: 15px;
+ padding: 10px 20px;
+ font-size: larger;
+ margin-right: 5px;
+}
+
button {
background-color: var(--more-color);
color: var(--text-color);
border: none;
- border-radius: 25px;
+ border-radius: 15px;
padding: 10px 20px;
cursor: pointer;
- font-size: 16px;
+ font-size: larger;
margin-left: 10px;
}
@@ -138,6 +119,7 @@ button:hover {
background-color: var(--accent-color);
}
+
table {
border-collapse: collapse;
max-width: 95%;
@@ -167,7 +149,7 @@ input[type="text"]:hover {
background-color: var(--code-color);
}
-td > span {
+td>span {
font-size: xxx-large;
font-family: STKaiti, KaiTi, serif;
}
@@ -176,6 +158,10 @@ td div {
text-align: center;
}
+/* Lay out <div>s nested in list items inside table cells inline rather than
+   as blocks, so they stay on the same line as the content before them. */
+td li div {
+ display: inline;
+}
+
ul {
list-style-type: none;
padding: 0;
@@ -220,6 +206,10 @@ code span {
text-align: center;
}
+/* Smaller type for elements with class "note" inside #note-area. */
+#note-area .note {
+ font-size: smaller;
+}
+
.tooltip {
position: relative;
display: inline-block;
@@ -239,9 +229,11 @@ code span {
padding: 5px 0;
position: absolute;
z-index: 1;
- bottom: 125%; /* 在元素上方 */
+ bottom: 125%;
+ /* 在元素上方 */
left: 50%;
- margin-left: -60px; /* 调整位置 */
+ margin-left: -60px;
+ /* 调整位置 */
opacity: 0;
transition: opacity 0.3s;
}
@@ -277,4 +269,4 @@ code span {
color: #f8f8f8;
border: 1px solid #666;
}
-}
+}
\ No newline at end of file
diff --git a/web/index.html b/web/index.html
index d7b4bd3..643bc5b 100644
--- a/web/index.html
+++ b/web/index.html
@@ -20,12 +20,10 @@
五笔拆解查询
- 📝 注意这里五笔版本是1986版(王码)五笔。
- ⚠️ 标识表示全码和容错码存在一定争议(比如起笔或末笔笔画顺序)。
-
+
diff --git a/web/js/render.js b/web/js/render.js
index 0a396c2..9eaf9d5 100644
--- a/web/js/render.js
+++ b/web/js/render.js
@@ -1,25 +1,48 @@
-const dataFile = "data/data.json";
-let data = {};
-fetch(dataFile)
- .then((response) => {
- if (!response.ok) {
- throw new Error("Network response was not ok");
- }
- return response.json();
- })
- .then((jsonData) => {
- data = jsonData;
- })
- .catch((error) => {
- console.error("Read JSON error:", error);
- });
+document.addEventListener("DOMContentLoaded", () => {
+ const dataFile = "data/data.json";
-// event
-document.getElementById("query-button").addEventListener("click", queryHanzi);
-document
- .getElementById("query-text")
- .addEventListener("keydown", function (event) {
- if (event.key === "Enter") {
- queryHanzi();
- }
- });
+ // init data
+ fetch(dataFile)
+ .then((response) => {
+ if (!response.ok) {
+ throw new Error("Network response error");
+ }
+ return response.json();
+ })
+ .then((data) => {
+ const charData = data.chars;
+ const statsData = data.stats;
+
+ const paragraphs = [
+ "📝 注意这里五笔版本是1986版(王码)五笔。
",
+ `当前收录汉字: ${statsData.total}字(五笔全码: ${statsData.fullCode}, 字根拆解:${statsData.units},图解:${statsData.segments})。
`,
+ '⚠️ 标识表示全码和容错码存在一定争议(比如起笔或末笔笔画顺序)。
',
+ ];
+
+ const note = document.getElementById('note-area');
+ const para = document.getElementById('note-warning');
+ para.innerHTML = "";
+ paragraphs.forEach(text => {
+ const more = document.createElement('p');
+ more.innerHTML = text;
+ note.insertBefore(more, para);
+ });
+
+ // event
+ document
+ .getElementById("query-button")
+ .addEventListener("click", () => {
+ queryHanzi(charData, statsData);
+ });
+ document
+ .getElementById("query-text")
+ .addEventListener("keydown", function (event) {
+ if (event.key === "Enter") {
+ queryHanzi(charData, statsData);
+ }
+ });
+ })
+ .catch((error) => {
+ console.error("Read JSON error:", error);
+ });
+});
diff --git a/web/js/script.js b/web/js/script.js
index b30b489..ccaeb9d 100644
--- a/web/js/script.js
+++ b/web/js/script.js
@@ -37,7 +37,7 @@ function plotWubiSegments(target, charData, segments) {
}
}
-function getListData(keys, values) {
+function getListData(keys, values, charNames) {
const itemList = document.createElement("ul");
if (keys === null) {
let arr = values;
@@ -59,7 +59,7 @@ function getListData(keys, values) {
if (keys[index] == "收录字表") {
const tip = document.createElement("strong");
tip.innerText = keys[index] + ":";
- const container = getHanziList(item);
+ const container = getHanziList(item, charNames);
listItem.append(tip);
listItem.append(container);
} else {
@@ -73,8 +73,13 @@ function getListData(keys, values) {
.map((item) => ` ${item}
`)
.join("
");
} else {
+ const ignores = ["笔画数量", "备注"];
val = item.replace(/;(.+)/, "$1");
- val = ` ${val}
`;
+ if (ignores.includes(keys[index])) {
+ val = ` ${val}`;
+ } else {
+ val = ` ${val}
`;
+ }
}
if (item.startsWith("*")) {
val = ` ⚠️${val}`;
@@ -91,22 +96,12 @@ function getListData(keys, values) {
return itemList;
}
-function getHanziList(sources) {
- const names = {
- 一级: "《通用规范汉字表》(2012年)一级汉字",
- 二级: "《通用规范汉字表》(2012年)二级汉字",
- 三级: "《通用规范汉字表》(2012年)三级汉字",
- GB2312: "《信息交换用汉字编码字符集》(GB/T 2312-1980)",
- 常用字: "《现代汉语常用字表》(1988年)常用字",
- 次常用字: "《现代汉语常用字表》(1988年)次常用字",
- 通用字: "《现代汉语通用字表》(1988年)常用字",
- 其他: "其他常用汉字",
- };
+function getHanziList(sources, charNames) {
// console.log(sources);
const values = sources.split("/");
values.forEach((item) => {
const name = item.charAt(0);
- names[item];
+ charNames[item];
const div = document.createElement("div");
});
const container = document.createElement("div");
@@ -117,7 +112,7 @@ function getHanziList(sources) {
const tooltipText = document.createElement("span");
tooltipText.className = "tooltiptext";
- tooltipText.textContent = names[item];
+ tooltipText.textContent = charNames[item];
tooltipDiv.appendChild(tooltipText);
container.appendChild(tooltipDiv);
@@ -125,8 +120,7 @@ function getHanziList(sources) {
return container;
}
-function createTableRow(index, data, char) {
- const charInfo = data[char];
+function createTableRow(index, char, charInfo, charNames) {
const row = document.createElement("tr");
const indexCell = document.createElement("td");
@@ -140,18 +134,20 @@ function createTableRow(index, data, char) {
charCell.innerHTML = `${char}`;
infoCell.appendChild(
getListData(
- ["汉语拼音", "UNICODE", "收录字表"],
- [charInfo.pinyin, charInfo.unicode, charInfo.source]
+ ["UNICODE", "汉语拼音", "笔画数量", "收录字表"],
+ [charInfo.unicode, charInfo.pinyin, charInfo.stroke, charInfo.source],
+ charNames
)
);
codeCell.appendChild(
getListData(
- ["全码", "拆解", "识别"],
- [charInfo.fullCode, charInfo.units, charInfo.flag]
+ ["全码", "拆解", "识别", "备注"],
+ [charInfo.fullCode, charInfo.units, charInfo.flag, charInfo.unitType],
+ charNames
)
);
extraCell.appendChild(
- getListData(["简码", "容错"], [charInfo.shortCode, charInfo.faultCode])
+ getListData(["简码", "容错"], [charInfo.shortCode, charInfo.faultCode], charNames)
);
row.appendChild(indexCell);
@@ -195,13 +191,14 @@ function initTable(show) {
tableHead.append(headRow);
}
-function queryHanzi() {
+function queryHanzi(charData, statsData) {
// only top 10 chars
+ const charNames = statsData.names;
const input = document.getElementById("query-text").value.trim();
const chars = input.replace(/[a-zA-Z\d\s]/g, "").slice(0, 10);
- const result = document.getElementById("result");
- result.innerText = "";
+ const warning = document.getElementById("note-warning");
+ warning.innerText = "";
const tableBody = document.querySelector("#data-table tbody");
tableBody.innerHTML = ""; // clean table
@@ -209,8 +206,8 @@ function queryHanzi() {
let valid = 0;
Array.from(chars).forEach((char, index) => {
// console.log(char);
- if (char in data) {
- row = createTableRow(index, data, char);
+ if (char in charData) {
+ row = createTableRow(index, char, charData[char], charNames);
tableBody.appendChild(row);
valid += 1;
}
@@ -218,11 +215,11 @@ function queryHanzi() {
initTable(valid !== 0);
if (valid === 0) {
if (chars) {
- result.innerText = "非常用汉字,请尝试其他。";
+ warning.innerText = "🚫 非规范汉字或罕用,请尝试其他。";
} else {
- result.innerText = "请输入常用汉字。";
+ warning.innerText = "❗ 请输入常用规范汉字。";
}
} else {
- result.innerText = "";
+ warning.innerText = "";
}
}