From 19596ab25ce897d295000c70b162bb1585aae5e4 Mon Sep 17 00:00:00 2001
From: hantang <34468720+hantang@users.noreply.github.com>
Date: Tue, 15 Oct 2024 16:32:35 +0800
Subject: [PATCH] Update src: emit stats and stroke/unit-type fields, render notes from data

---
 scripts/parse.py   | 83 ++++++++++++++++++++++++++++++++++++++--------
 web/css/styles.css | 72 ++++++++++++++++++----------------------
 web/index.html     |  6 ++--
 web/js/render.js   | 71 +++++++++++++++++++++++++--------------
 web/js/script.js   | 59 ++++++++++++++++----------------
 5 files changed, 178 insertions(+), 113 deletions(-)

diff --git a/scripts/parse.py b/scripts/parse.py
index 4cfedf5..b8704e2 100644
--- a/scripts/parse.py
+++ b/scripts/parse.py
@@ -7,20 +7,24 @@
 
 RENAMED_COLS = {
     "Unicode": "unicode",
-    # "全码": "fullCode",
+    "全码": "fullCode",
     "简码": "shortCode",
     "容错码": "faultCode",
     "拼音": "pinyin",
-    # "字根拆解": "units",
+    "笔画数": "stroke",
+    "字根拆解": "units",
     "字表来源": "source",
+    "笔画拆解": "segments",
     "识别码": "flag",
 }
 
 OUTPUT_COLS = [
     "unicode",
     "pinyin",
+    "stroke",
     "source",
     "freq",
+    "unitType",
     "fullCode",
     "shortCode",
     "faultCode",
@@ -29,6 +33,17 @@
     "flag",
 ]
 
+CHAR_NAMES = {
+    "一级": "《通用规范汉字表》（2012年）一级汉字",
+    "二级": "《通用规范汉字表》（2012年）二级汉字",
+    "三级": "《通用规范汉字表》（2012年）三级汉字",
+    "GB2312": "《信息交换用汉字编码字符集》（GB/T 2312-1980）",
+    "常用字": "《现代汉语常用字表》（1988年）常用字",
+    "次常用字": "《现代汉语常用字表》（1988年）次常用字",
+    "通用字": "《现代汉语通用字表》（1988年）通用字",
+    "其他": "其他常用汉字",
+}
+
 
 def read_source(data_dir: str):
     files = Path(data_dir).glob("*.tsv")
@@ -41,13 +56,50 @@ def read_source(data_dir: str):
     return df
 
 
+def get_stats(df, col="字表来源"):
+    """Count characters per source list in `col`, plus rows with non-empty code columns."""
+    col_temp, col_out = "字表分级", "字表"
+    sources = [
+        v.strip()
+        for line in df[col].fillna("").tolist()
+        for v in line.strip().split("/")
+        if v.strip()
+    ]
+    # rename_axis + reset_index(name=...) gives [col_temp, "count"] on pandas 1.x and 2.x alike.
+    counts = pd.Series(sources, name=col_temp).value_counts()
+    df_stats = counts.rename_axis(col_temp).reset_index(name="count")
+    names = CHAR_NAMES
+    df_stats[col_out] = df_stats[col_temp].apply(lambda x: names.get(x, x))
+    result = df_stats[[col_out, "count"]].to_dict("records")
+    out = {"names": names, "stats": result, "total": df.shape[0]}
+    for key in ["全码", "字根拆解", "笔画拆解"]:
+        out[RENAMED_COLS[key]] = df[df[key].fillna("") != ""].shape[0]
+    return out
+
+
 def _to_int_array(x: str) -> list:
     return [[int(u) for u in v.split(",") if u] for v in x.strip("*").split("/")]
 
 
-def tsv_to_json(data_dir: str, save_dir: str) -> None:
+def _unit_type(x):
+    """Label a row with the 字根 kinds (first two chars of each column name) flagged as 1."""
+    flags = ("键名字根", "笔画字根", "成字字根")
+    kinds = [name[:2] for name in flags if x[name] == 1]
+    if not kinds:
+        return ""
+    return "字根（{}）".format("，".join(kinds))
 
-    save_dir = Path(save_dir)
+
+def _full_code(x):
+    """Return 全码, with its last character split off by ';' when 识别码 is set."""
+    fullcode = x["全码"]
+    if x["识别码"] and fullcode:  # empty code has no last char to split (avoids IndexError)
+        return "{};{}".format(fullcode[:-1], fullcode[-1])
+    return fullcode
+
+
+def tsv_to_json(data_dir: str, save_path: str) -> None:
+    save_dir = Path(save_path)
     save_file = Path(save_dir, "data.json")
     if not save_dir.exists():
         logging.info(f"Create dir = {save_dir}")
@@ -59,31 +111,34 @@ def tsv_to_json(data_dir: str, save_dir: str) -> None:
         return
     logging.info(f"df = {df.shape}")
 
-    for col in ["全码", "简码", "容错码", "字根拆解", "笔画拆解", "识别码"]:
+    for col in ["全码", "简码", "容错码", "字根拆解", "笔画拆解", "识别码", '笔画数']:
         df[col] = df[col].fillna("").astype(str)
     for col in ["现代汉语语料库字频（%）", "刑红兵25亿字语料字频（百万）"]:
         df[col] = df[col].fillna(0).astype(float)
 
-    df.index = df["汉字"]
-    df = df.rename(columns=RENAMED_COLS)
-    df["fullCode"] = df[["全码", "flag"]].apply(
-        lambda x: "{};{}".format(x["全码"][:-1], x["全码"][-1]) if x["flag"] else x["全码"]
-    , axis=1)
+    df["fullCode"] = df.apply(_full_code, axis=1)
     df["units"] = df["字根拆解"].apply(lambda x: " ".join(x.split("※")))
     df["segments"] = df["笔画拆解"].apply(_to_int_array)
 
     keys = ["现代汉语语料库字频（%）", "刑红兵25亿字语料字频（百万）"]
     df["freq"] = df.apply(lambda x: [x[k] for k in keys], axis=1)
-    # keys = ["五笔常用前1500", "一级简码", "二级简码", "键名字根", "笔画字根", "成字字根"]
-    # df["level"] = df.apply(lambda x: [x[k] for k in keys], axis=1)
+    # "五笔常用前1500", "一级简码", "二级简码",
+    df["unitType"] = df.apply(_unit_type, axis=1)
+
+    stats = get_stats(df)
+    renames = {k: v for k, v in RENAMED_COLS.items() if v not in df.columns}
+    df = df.rename(columns=renames)
+    df = df.set_index("汉字")
 
     df2 = df[OUTPUT_COLS]
-    logging.info(f"output data = {df.shape}")
+    logging.info(f"output data = {df2.shape}")
 
     out = df2.to_dict("index")
+    result = {"stats": stats, "chars": out}
+
     logging.info(f"save to = {save_file}")
     with open(save_file, "w") as f:
-        json.dump(out, f, indent=None, ensure_ascii=False)
+        json.dump(result, f, indent=None, ensure_ascii=False)
 
 
 if __name__ == "__main__":
diff --git a/web/css/styles.css b/web/css/styles.css
index ca29c61..29f2457 100644
--- a/web/css/styles.css
+++ b/web/css/styles.css
@@ -10,6 +10,7 @@
   --area-color: #272b35;
   --code-color: #f5f5f5;
 }
+
 @font-face {
   font-family: "Wubi Units";
   src: url("../font/黑体字根.ttf");
@@ -76,40 +77,6 @@ hr {
   height: 5px;
 }
 
-button {
-  padding: 12px 24px;
-  border: none;
-  border-radius: 4px;
-  cursor: pointer;
-  font-size: 16px;
-  /* font-weight: 500; */
-  text-transform: uppercase;
-  transition: background-color 0.3s;
-  box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
-  margin: 0 5px;
-}
-
-input[type="text"] {
-  color: var(--text-color);
-  background-color: var(--background-color);
-  border: 1px solid var(--primary-color);
-  border-radius: 3px;
-  padding: 8px;
-  transition: border-color 0.3s;
-  box-sizing: border-box;
-  width: 100%;
-  max-width: 300px;
-  margin-right: 10px;
-  font-size: 16px;
-  /* font-family: "Roboto Mono", monospace; */
-  border: none;
-  outline: none;
-  padding: 10px;
-  border-radius: 25px;
-  flex: 1;
-  font-size: 16px;
-}
-
 .search-container {
   display: flex;
   align-items: center;
@@ -123,14 +90,28 @@ input[type="text"] {
   margin: 2rem auto;
 }
 
+
+input[type="text"] {
+  color: var(--text-color);
+  background-color: var(--background-color);
+  width: 100%;
+  flex: 1;
+  outline: none;
+  border: none;
+  border-radius: 15px;
+  padding: 10px 20px;
+  font-size: larger;
+  margin-right: 5px;
+}
+
 button {
   background-color: var(--more-color);
   color: var(--text-color);
   border: none;
-  border-radius: 25px;
+  border-radius: 15px;
   padding: 10px 20px;
   cursor: pointer;
-  font-size: 16px;
+  font-size: larger;
   margin-left: 10px;
 }
 
@@ -138,6 +119,7 @@ button:hover {
   background-color: var(--accent-color);
 }
 
+
 table {
   border-collapse: collapse;
   max-width: 95%;
@@ -167,7 +149,7 @@ input[type="text"]:hover {
   background-color: var(--code-color);
 }
 
-td > span {
+td>span {
   font-size: xxx-large;
   font-family: STKaiti, KaiTi, serif;
 }
@@ -176,6 +158,10 @@ td div {
   text-align: center;
 }
 
+td li div {
+  display: inline;
+}
+
 ul {
   list-style-type: none;
   padding: 0;
@@ -220,6 +206,10 @@ code span {
   text-align: center;
 }
 
+#note-area .note {
+  font-size: smaller;
+}
+
 .tooltip {
   position: relative;
   display: inline-block;
@@ -239,9 +229,11 @@ code span {
   padding: 5px 0;
   position: absolute;
   z-index: 1;
-  bottom: 125%; /* 在元素上方 */
+  bottom: 125%;
+  /* 在元素上方 */
   left: 50%;
-  margin-left: -60px; /* 调整位置 */
+  margin-left: -60px;
+  /* 调整位置 */
   opacity: 0;
   transition: opacity 0.3s;
 }
@@ -277,4 +269,4 @@ code span {
     color: #f8f8f8;
     border: 1px solid #666;
   }
-}
+}
\ No newline at end of file
diff --git a/web/index.html b/web/index.html
index d7b4bd3..643bc5b 100644
--- a/web/index.html
+++ b/web/index.html
@@ -20,12 +20,10 @@ <h1>五笔拆解查询</h1>
   <main>
     <section class="search-container">
       <input type="text" id="query-text" placeholder="请输入汉字..." />
-      <button type="submit" id="query-button">查询</button>
+      <button type="submit" id="query-button">🔍️</button>
     </section>
     <section id="note-area">
-      <p>📝 注意这里五笔版本是<strong>1986</strong>版（王码）五笔。</p>
-      <p> <small>⚠️ 标识表示全码和容错码存在一定争议（比如起笔或末笔笔画顺序）。</small></p>
-      <p id="result"></p>
+      <p id="note-warning" class="note"></p>
     </section>
 
     <section>
diff --git a/web/js/render.js b/web/js/render.js
index 0a396c2..9eaf9d5 100644
--- a/web/js/render.js
+++ b/web/js/render.js
@@ -1,25 +1,48 @@
-const dataFile = "data/data.json";
-let data = {};
-fetch(dataFile)
-  .then((response) => {
-    if (!response.ok) {
-      throw new Error("Network response was not ok");
-    }
-    return response.json();
-  })
-  .then((jsonData) => {
-    data = jsonData;
-  })
-  .catch((error) => {
-    console.error("Read JSON error:", error);
-  });
+document.addEventListener("DOMContentLoaded", () => {
+  const dataFile = "data/data.json";

-// event
-document.getElementById("query-button").addEventListener("click", queryHanzi);
-document
-  .getElementById("query-text")
-  .addEventListener("keydown", function (event) {
-    if (event.key === "Enter") {
-      queryHanzi();
-    }
-  });
+  // init data: fetch the JSON once, then render the notes and wire up the
+  // search controls with the loaded character and stats data.
+  fetch(dataFile)
+    .then((response) => {
+      if (!response.ok) {
+        throw new Error("Network response error");
+      }
+      return response.json();
+    })
+    .then((data) => {
+      const charData = data.chars;
+      const statsData = data.stats;
+
+      const paragraphs = [
+        "<p>📝 注意这里五笔版本是<strong>1986</strong>版（王码）五笔。</p>",
+        `<p class="note">当前收录汉字: ${statsData.total}字（五笔全码: ${statsData.fullCode}, 字根拆解：${statsData.units}，图解：${statsData.segments}）。</p>`,
+        '<p class="note">⚠️ 标识表示全码和容错码存在一定争议（比如起笔或末笔笔画顺序）。</p>',
+      ];
+
+      // Each entry is already a complete <p>, so insert it as a sibling before
+      // the warning paragraph instead of wrapping it in another <p> (nested
+      // <p> elements are invalid HTML).
+      const para = document.getElementById('note-warning');
+      para.innerHTML = "";
+      paragraphs.forEach((html) => {
+        para.insertAdjacentHTML("beforebegin", html);
+      });
+
+      // event: the button click and the Enter key run the same query
+      const runQuery = () => queryHanzi(charData, statsData);
+      document
+        .getElementById("query-button")
+        .addEventListener("click", runQuery);
+      document
+        .getElementById("query-text")
+        .addEventListener("keydown", (event) => {
+          if (event.key === "Enter") {
+            runQuery();
+          }
+        });
+    })
+    .catch((error) => {
+      console.error("Read JSON error:", error);
+    });
+});
diff --git a/web/js/script.js b/web/js/script.js
index b30b489..ccaeb9d 100644
--- a/web/js/script.js
+++ b/web/js/script.js
@@ -37,7 +37,7 @@ function plotWubiSegments(target, charData, segments) {
   }
 }
 
-function getListData(keys, values) {
+function getListData(keys, values, charNames) {
   const itemList = document.createElement("ul");
   if (keys === null) {
     let arr = values;
@@ -59,7 +59,7 @@ function getListData(keys, values) {
       if (keys[index] == "收录字表") {
         const tip = document.createElement("strong");
         tip.innerText = keys[index] + ":";
-        const container = getHanziList(item);
+        const container = getHanziList(item, charNames);
         listItem.append(tip);
         listItem.append(container);
       } else {
@@ -73,8 +73,13 @@ function getListData(keys, values) {
               .map((item) => `&nbsp;&nbsp;<code>${item}</code>`)
               .join("<br>");
         } else {
+          const ignores = ["笔画数量", "备注"];
           val = item.replace(/;(.+)/, "<span>$1</span>");
-          val = `&nbsp;&nbsp;<code>${val}</code>`;
+          if (ignores.includes(keys[index])) {
+            val = `&nbsp;&nbsp;${val}`;
+          } else {
+            val = `&nbsp;&nbsp;<code>${val}</code>`;
+          }
         }
         if (item.startsWith("*")) {
           val = `&nbsp;⚠️${val}`;
@@ -91,22 +96,12 @@ function getListData(keys, values) {
   return itemList;
 }
 
-function getHanziList(sources) {
-  const names = {
-    一级: "《通用规范汉字表》（2012年）一级汉字",
-    二级: "《通用规范汉字表》（2012年）二级汉字",
-    三级: "《通用规范汉字表》（2012年）三级汉字",
-    GB2312: "《信息交换用汉字编码字符集》（GB/T 2312-1980）",
-    常用字: "《现代汉语常用字表》（1988年）常用字",
-    次常用字: "《现代汉语常用字表》（1988年）次常用字",
-    通用字: "《现代汉语通用字表》（1988年）常用字",
-    其他: "其他常用汉字",
-  };
+function getHanziList(sources, charNames) {
   // console.log(sources);
   const values = sources.split("/");
   values.forEach((item) => {
     const name = item.charAt(0);
-    names[item];
+    charNames[item];
     const div = document.createElement("div");
   });
   const container = document.createElement("div");
@@ -117,7 +112,7 @@ function getHanziList(sources) {
 
     const tooltipText = document.createElement("span");
     tooltipText.className = "tooltiptext";
-    tooltipText.textContent = names[item];
+    tooltipText.textContent = charNames[item];
 
     tooltipDiv.appendChild(tooltipText);
     container.appendChild(tooltipDiv);
@@ -125,8 +120,7 @@ function getHanziList(sources) {
   return container;
 }
 
-function createTableRow(index, data, char) {
-  const charInfo = data[char];
+function createTableRow(index, char, charInfo, charNames) {
   const row = document.createElement("tr");
 
   const indexCell = document.createElement("td");
@@ -140,18 +134,20 @@ function createTableRow(index, data, char) {
   charCell.innerHTML = `<span>${char}</span>`;
   infoCell.appendChild(
     getListData(
-      ["汉语拼音", "UNICODE", "收录字表"],
-      [charInfo.pinyin, charInfo.unicode, charInfo.source]
+      ["UNICODE", "汉语拼音", "笔画数量", "收录字表"],
+      [charInfo.unicode, charInfo.pinyin, charInfo.stroke, charInfo.source],
+      charNames
     )
   );
   codeCell.appendChild(
     getListData(
-      ["全码", "拆解", "识别"],
-      [charInfo.fullCode, charInfo.units, charInfo.flag]
+      ["全码", "拆解", "识别", "备注"],
+      [charInfo.fullCode, charInfo.units, charInfo.flag, charInfo.unitType],
+      charNames
     )
   );
   extraCell.appendChild(
-    getListData(["简码", "容错"], [charInfo.shortCode, charInfo.faultCode])
+    getListData(["简码", "容错"], [charInfo.shortCode, charInfo.faultCode], charNames)
   );
 
   row.appendChild(indexCell);
@@ -195,13 +191,14 @@ function initTable(show) {
   tableHead.append(headRow);
 }
 
-function queryHanzi() {
+function queryHanzi(charData, statsData) {
   // only top 10 chars
+  const charNames = statsData.names;
   const input = document.getElementById("query-text").value.trim();
   const chars = input.replace(/[a-zA-Z\d\s]/g, "").slice(0, 10);
 
-  const result = document.getElementById("result");
-  result.innerText = "";
+  const warning = document.getElementById("note-warning");
+  warning.innerText = "";
 
   const tableBody = document.querySelector("#data-table tbody");
   tableBody.innerHTML = ""; // clean table
@@ -209,8 +206,8 @@ function queryHanzi() {
   let valid = 0;
   Array.from(chars).forEach((char, index) => {
     // console.log(char);
-    if (char in data) {
-      row = createTableRow(index, data, char);
+    if (char in charData) {
+      row = createTableRow(index, char, charData[char], charNames);
       tableBody.appendChild(row);
       valid += 1;
     }
@@ -218,11 +215,11 @@ function queryHanzi() {
   initTable(valid !== 0);
   if (valid === 0) {
     if (chars) {
-      result.innerText = "非常用汉字，请尝试其他。";
+      warning.innerText = "🚫 非规范汉字或罕用，请尝试其他。";
     } else {
-      result.innerText = "请输入常用汉字。";
+      warning.innerText = "❗ 请输入常用规范汉字。";
     }
   } else {
-    result.innerText = "";
+    warning.innerText = "";
   }
 }