From 19596ab25ce897d295000c70b162bb1585aae5e4 Mon Sep 17 00:00:00 2001 From: hantang <34468720+hantang@users.noreply.github.com> Date: Tue, 15 Oct 2024 16:32:35 +0800 Subject: [PATCH] Update src --- scripts/parse.py | 83 ++++++++++++++++++++++++++++++++++++++-------- web/css/styles.css | 72 ++++++++++++++++++---------------------- web/index.html | 6 ++-- web/js/render.js | 71 +++++++++++++++++++++++++-------------- web/js/script.js | 59 ++++++++++++++++---------------- 5 files changed, 178 insertions(+), 113 deletions(-) diff --git a/scripts/parse.py b/scripts/parse.py index 4cfedf5..b8704e2 100644 --- a/scripts/parse.py +++ b/scripts/parse.py @@ -7,20 +7,24 @@ RENAMED_COLS = { "Unicode": "unicode", - # "全码": "fullCode", + "全码": "fullCode", "简码": "shortCode", "容错码": "faultCode", "拼音": "pinyin", - # "字根拆解": "units", + "笔画数": "stroke", + "字根拆解": "units", "字表来源": "source", + "笔画拆解": "segments", "识别码": "flag", } OUTPUT_COLS = [ "unicode", "pinyin", + "stroke", "source", "freq", + "unitType", "fullCode", "shortCode", "faultCode", @@ -29,6 +33,17 @@ "flag", ] +CHAR_NAMES = { + "一级": "《通用规范汉字表》(2012年)一级汉字", + "二级": "《通用规范汉字表》(2012年)二级汉字", + "三级": "《通用规范汉字表》(2012年)三级汉字", + "GB2312": "《信息交换用汉字编码字符集》(GB/T 2312-1980)", + "常用字": "《现代汉语常用字表》(1988年)常用字", + "次常用字": "《现代汉语常用字表》(1988年)次常用字", + "通用字": "《现代汉语通用字表》(1988年)通用字", + "其他": "其他常用汉字", +} + def read_source(data_dir: str): files = Path(data_dir).glob("*.tsv") @@ -41,13 +56,50 @@ def read_source(data_dir: str): return df +def get_stats(df, col="字表来源"): + col_temp = "字表分级" + col_out = "字表" + sources = [ + v.strip() + for line in df[col].fillna("").tolist() + for v in line.strip().split("/") + if v.strip() + ] + df_stats = pd.Series(sources, name=col_temp).value_counts().reset_index() + + names = CHAR_NAMES + df_stats[col_out] = df_stats[col_temp].apply(lambda x: names.get(x, x)) + result = df_stats[[col_out, "count"]].to_dict("records") + out = {"names": names, "stats": result, "total": df.shape[0]} + + for key in ["全码", "字根拆解", "笔画拆解"]: + out[RENAMED_COLS[key]] = df[df[key].fillna("") != ""].shape[0] + return out + + def _to_int_array(x: str) -> list: return [[int(u) for u in v.split(",") if u] for v in x.strip("*").split("/")] -def tsv_to_json(data_dir: str, save_dir: str) -> None: +def _unit_type(x): + keys = ["键名字根", "笔画字根", "成字字根"] + vals = [k[:2] if x[k] == 1 else "" for k in keys] + vals = [v for v in vals if v] + if vals: + return "字根({})".format(",".join(vals)) + return "" - save_dir = Path(save_dir) + +def _full_code(x): + fullcode = x["全码"] + if x["识别码"]: + return "{};{}".format(fullcode[:-1], fullcode[-1]) + else: + return fullcode + + +def tsv_to_json(data_dir: str, save_path: str) -> None: + save_dir = Path(save_path) save_file = Path(save_dir, "data.json") if not save_dir.exists(): logging.info(f"Create dir = {save_dir}") @@ -59,31 +111,34 @@ def tsv_to_json(data_dir: str, save_dir: str) -> None: return logging.info(f"df = {df.shape}") - for col in ["全码", "简码", "容错码", "字根拆解", "笔画拆解", "识别码"]: + for col in ["全码", "简码", "容错码", "字根拆解", "笔画拆解", "识别码", '笔画数']: df[col] = df[col].fillna("").astype(str) for col in ["现代汉语语料库字频(%)", "刑红兵25亿字语料字频(百万)"]: df[col] = df[col].fillna(0).astype(float) - df.index = df["汉字"] - df = df.rename(columns=RENAMED_COLS) - df["fullCode"] = df[["全码", "flag"]].apply( - lambda x: "{};{}".format(x["全码"][:-1], x["全码"][-1]) if x["flag"] else x["全码"] - , axis=1) + df["fullCode"] = df.apply(_full_code, axis=1) df["units"] = df["字根拆解"].apply(lambda x: " ".join(x.split("※"))) df["segments"] = df["笔画拆解"].apply(_to_int_array) keys = ["现代汉语语料库字频(%)", "刑红兵25亿字语料字频(百万)"] df["freq"] = df.apply(lambda x: [x[k] for k in keys], axis=1) - # keys = ["五笔常用前1500", "一级简码", "二级简码", "键名字根", "笔画字根", "成字字根"] - # df["level"] = df.apply(lambda x: [x[k] for k in keys], axis=1) + # "五笔常用前1500", "一级简码", "二级简码", + df["unitType"] = df.apply(_unit_type, axis=1) + + stats = get_stats(df) + renames = {k: v for k, v in RENAMED_COLS.items() if v not in df.columns} + df = df.rename(columns=renames) + df = df.set_index("汉字") df2 = df[OUTPUT_COLS] - logging.info(f"output data = {df.shape}") + logging.info(f"output data = {df2.shape}") out = df2.to_dict("index") + result = {"stats": stats, "chars": out} + logging.info(f"save to = {save_file}") with open(save_file, "w") as f: - json.dump(out, f, indent=None, ensure_ascii=False) + json.dump(result, f, indent=None, ensure_ascii=False) if __name__ == "__main__": diff --git a/web/css/styles.css b/web/css/styles.css index ca29c61..29f2457 100644 --- a/web/css/styles.css +++ b/web/css/styles.css @@ -10,6 +10,7 @@ --area-color: #272b35; --code-color: #f5f5f5; } + @font-face { font-family: "Wubi Units"; src: url("../font/黑体字根.ttf"); @@ -76,40 +77,6 @@ hr { height: 5px; } -button { - padding: 12px 24px; - border: none; - border-radius: 4px; - cursor: pointer; - font-size: 16px; - /* font-weight: 500; */ - text-transform: uppercase; - transition: background-color 0.3s; - box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2); - margin: 0 5px; -} - -input[type="text"] { - color: var(--text-color); - background-color: var(--background-color); - border: 1px solid var(--primary-color); - border-radius: 3px; - padding: 8px; - transition: border-color 0.3s; - box-sizing: border-box; - width: 100%; - max-width: 300px; - margin-right: 10px; - font-size: 16px; - /* font-family: "Roboto Mono", monospace; */ - border: none; - outline: none; - padding: 10px; - border-radius: 25px; - flex: 1; - font-size: 16px; -} - .search-container { display: flex; align-items: center; @@ -123,14 +90,28 @@ input[type="text"] { margin: 2rem auto; } + +input[type="text"] { + color: var(--text-color); + background-color: var(--background-color); + width: 100%; + flex: 1; + outline: none; + border: none; + border-radius: 15px; + padding: 10px 20px; + font-size: larger; + margin-right: 5px; +} + button { background-color: var(--more-color); color: var(--text-color); border: none; - border-radius: 25px; + border-radius: 15px; padding: 10px 20px; cursor: pointer; - font-size: 16px; + font-size: larger; margin-left: 10px; } @@ -138,6 +119,7 @@ button:hover { background-color: var(--accent-color); } + table { border-collapse: collapse; max-width: 95%; @@ -167,7 +149,7 @@ input[type="text"]:hover { background-color: var(--code-color); } -td > span { +td>span { font-size: xxx-large; font-family: STKaiti, KaiTi, serif; } @@ -176,6 +158,10 @@ td div { text-align: center; } +td li div { + display: inline; +} + ul { list-style-type: none; padding: 0; @@ -220,6 +206,10 @@ code span { text-align: center; } +#note-area .note { + font-size: smaller; +} + .tooltip { position: relative; display: inline-block; @@ -239,9 +229,11 @@ code span { padding: 5px 0; position: absolute; z-index: 1; - bottom: 125%; /* 在元素上方 */ + bottom: 125%; + /* 在元素上方 */ left: 50%; - margin-left: -60px; /* 调整位置 */ + margin-left: -60px; + /* 调整位置 */ opacity: 0; transition: opacity 0.3s; } @@ -277,4 +269,4 @@ code span { color: #f8f8f8; border: 1px solid #666; } -} +} \ No newline at end of file diff --git a/web/index.html b/web/index.html index d7b4bd3..643bc5b 100644 --- a/web/index.html +++ b/web/index.html @@ -20,12 +20,10 @@

五笔拆解查询

- +
-

📝 注意这里五笔版本是1986版(王码)五笔。

-

⚠️ 标识表示全码和容错码存在一定争议(比如起笔或末笔笔画顺序)。

-

+

diff --git a/web/js/render.js b/web/js/render.js index 0a396c2..9eaf9d5 100644 --- a/web/js/render.js +++ b/web/js/render.js @@ -1,25 +1,48 @@ -const dataFile = "data/data.json"; -let data = {}; -fetch(dataFile) - .then((response) => { - if (!response.ok) { - throw new Error("Network response was not ok"); - } - return response.json(); - }) - .then((jsonData) => { - data = jsonData; - }) - .catch((error) => { - console.error("Read JSON error:", error); - }); +document.addEventListener("DOMContentLoaded", () => { + const dataFile = "data/data.json"; -// event -document.getElementById("query-button").addEventListener("click", queryHanzi); -document - .getElementById("query-text") - .addEventListener("keydown", function (event) { - if (event.key === "Enter") { - queryHanzi(); - } - }); + // init data + fetch(dataFile) + .then((response) => { + if (!response.ok) { + throw new Error("Network response error"); + } + return response.json(); + }) + .then((data) => { + const charData = data.chars; + const statsData = data.stats; + + const paragraphs = [ + "

📝 注意这里五笔版本是1986版(王码)五笔。

", + `

当前收录汉字: ${statsData.total}字(五笔全码: ${statsData.fullCode}, 字根拆解:${statsData.units},图解:${statsData.segments})。

`, + '

⚠️ 标识表示全码和容错码存在一定争议(比如起笔或末笔笔画顺序)。

', + ]; + + const note = document.getElementById('note-area'); + const para = document.getElementById('note-warning'); + para.innerHTML = ""; + paragraphs.forEach(text => { + const more = document.createElement('p'); + more.innerHTML = text; + note.insertBefore(more, para); + }); + + // event + document + .getElementById("query-button") + .addEventListener("click", () => { + queryHanzi(charData, statsData); + }); + document + .getElementById("query-text") + .addEventListener("keydown", function (event) { + if (event.key === "Enter") { + queryHanzi(charData, statsData); + } + }); + }) + .catch((error) => { + console.error("Read JSON error:", error); + }); +}); diff --git a/web/js/script.js b/web/js/script.js index b30b489..ccaeb9d 100644 --- a/web/js/script.js +++ b/web/js/script.js @@ -37,7 +37,7 @@ function plotWubiSegments(target, charData, segments) { } } -function getListData(keys, values) { +function getListData(keys, values, charNames) { const itemList = document.createElement("ul"); if (keys === null) { let arr = values; @@ -59,7 +59,7 @@ function getListData(keys, values) { if (keys[index] == "收录字表") { const tip = document.createElement("strong"); tip.innerText = keys[index] + ":"; - const container = getHanziList(item); + const container = getHanziList(item, charNames); listItem.append(tip); listItem.append(container); } else { @@ -73,8 +73,13 @@ function getListData(keys, values) { .map((item) => `  ${item}`) .join("
"); } else { + const ignores = ["笔画数量", "备注"]; val = item.replace(/;(.+)/, "$1"); - val = `  ${val}`; + if (ignores.includes(keys[index])) { + val = `  ${val}`; + } else { + val = `  ${val}`; + } } if (item.startsWith("*")) { val = ` ⚠️${val}`; @@ -91,22 +96,12 @@ function getListData(keys, values) { return itemList; } -function getHanziList(sources) { - const names = { - 一级: "《通用规范汉字表》(2012年)一级汉字", - 二级: "《通用规范汉字表》(2012年)二级汉字", - 三级: "《通用规范汉字表》(2012年)三级汉字", - GB2312: "《信息交换用汉字编码字符集》(GB/T 2312-1980)", - 常用字: "《现代汉语常用字表》(1988年)常用字", - 次常用字: "《现代汉语常用字表》(1988年)次常用字", - 通用字: "《现代汉语通用字表》(1988年)常用字", - 其他: "其他常用汉字", - }; +function getHanziList(sources, charNames) { // console.log(sources); const values = sources.split("/"); values.forEach((item) => { const name = item.charAt(0); - names[item]; + charNames[item]; const div = document.createElement("div"); }); const container = document.createElement("div"); @@ -117,7 +112,7 @@ function getHanziList(sources) { const tooltipText = document.createElement("span"); tooltipText.className = "tooltiptext"; - tooltipText.textContent = names[item]; + tooltipText.textContent = charNames[item]; tooltipDiv.appendChild(tooltipText); container.appendChild(tooltipDiv); @@ -125,8 +120,7 @@ function getHanziList(sources) { return container; } -function createTableRow(index, data, char) { - const charInfo = data[char]; +function createTableRow(index, char, charInfo, charNames) { const row = document.createElement("tr"); const indexCell = document.createElement("td"); @@ -140,18 +134,20 @@ function createTableRow(index, data, char) { charCell.innerHTML = `${char}`; infoCell.appendChild( getListData( - ["汉语拼音", "UNICODE", "收录字表"], - [charInfo.pinyin, charInfo.unicode, charInfo.source] + ["UNICODE", "汉语拼音", "笔画数量", "收录字表"], + [charInfo.unicode, charInfo.pinyin, charInfo.stroke, charInfo.source], + charNames ) ); codeCell.appendChild( getListData( - ["全码", "拆解", "识别"], - [charInfo.fullCode, charInfo.units, charInfo.flag] + ["全码", "拆解", "识别", "备注"], + [charInfo.fullCode, charInfo.units, charInfo.flag, charInfo.unitType], + charNames ) ); extraCell.appendChild( - getListData(["简码", "容错"], [charInfo.shortCode, charInfo.faultCode]) + getListData(["简码", "容错"], [charInfo.shortCode, charInfo.faultCode], charNames) ); row.appendChild(indexCell); @@ -195,13 +191,14 @@ function initTable(show) { tableHead.append(headRow); } -function queryHanzi() { +function queryHanzi(charData, statsData) { // only top 10 chars + const charNames = statsData.names; const input = document.getElementById("query-text").value.trim(); const chars = input.replace(/[a-zA-Z\d\s]/g, "").slice(0, 10); - const result = document.getElementById("result"); - result.innerText = ""; + const warning = document.getElementById("note-warning"); + warning.innerText = ""; const tableBody = document.querySelector("#data-table tbody"); tableBody.innerHTML = ""; // clean table @@ -209,8 +206,8 @@ function queryHanzi() { let valid = 0; Array.from(chars).forEach((char, index) => { // console.log(char); - if (char in data) { - row = createTableRow(index, data, char); + if (char in charData) { + row = createTableRow(index, char, charData[char], charNames); tableBody.appendChild(row); valid += 1; } @@ -218,11 +215,11 @@ function queryHanzi() { initTable(valid !== 0); if (valid === 0) { if (chars) { - result.innerText = "非常用汉字,请尝试其他。"; + warning.innerText = "🚫 非规范汉字或罕用,请尝试其他。"; } else { - result.innerText = "请输入常用汉字。"; + warning.innerText = "❗ 请输入常用规范汉字。"; } } else { - result.innerText = ""; + warning.innerText = ""; } }