Commit
Update src
hantang committed Oct 15, 2024
1 parent 570716e commit 19596ab
Showing 5 changed files with 178 additions and 113 deletions.
83 changes: 69 additions & 14 deletions scripts/parse.py
@@ -7,20 +7,24 @@

RENAMED_COLS = {
"Unicode": "unicode",
# "全码": "fullCode",
"全码": "fullCode",
"简码": "shortCode",
"容错码": "faultCode",
"拼音": "pinyin",
# "字根拆解": "units",
"笔画数": "stroke",
"字根拆解": "units",
"字表来源": "source",
"笔画拆解": "segments",
"识别码": "flag",
}

OUTPUT_COLS = [
"unicode",
"pinyin",
"stroke",
"source",
"freq",
"unitType",
"fullCode",
"shortCode",
"faultCode",
@@ -29,6 +33,17 @@
"flag",
]

CHAR_NAMES = {
"一级": "《通用规范汉字表》(2012年)一级汉字",
"二级": "《通用规范汉字表》(2012年)二级汉字",
"三级": "《通用规范汉字表》(2012年)三级汉字",
"GB2312": "《信息交换用汉字编码字符集》(GB/T 2312-1980)",
"常用字": "《现代汉语常用字表》(1988年)常用字",
"次常用字": "《现代汉语常用字表》(1988年)次常用字",
"通用字": "《现代汉语通用字表》(1988年)通用字",
"其他": "其他常用汉字",
}


def read_source(data_dir: str):
files = Path(data_dir).glob("*.tsv")
@@ -41,13 +56,50 @@ def read_source(data_dir: str):
return df


def get_stats(df, col="字表来源"):
col_temp = "字表分级"
col_out = "字表"
sources = [
v.strip()
for line in df[col].fillna("").tolist()
for v in line.strip().split("/")
if v.strip()
]
df_stats = pd.Series(sources, name=col_temp).value_counts().reset_index()

names = CHAR_NAMES
df_stats[col_out] = df_stats[col_temp].apply(lambda x: names.get(x, x))
result = df_stats[[col_out, "count"]].to_dict("records")
out = {"names": names, "stats": result, "total": df.shape[0]}

for key in ["全码", "字根拆解", "笔画拆解"]:
out[RENAMED_COLS[key]] = df[df[key].fillna("") != ""].shape[0]
return out
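For context on the df_stats[[col_out, "count"]] selection above, a minimal sketch (an assumption about the environment, not part of the commit) of the frame shape get_stats relies on; it matches pandas 2.x, where value_counts().reset_index() names the counts column "count":

# Illustrative check only; assumes pandas >= 2.0, where the counts column
# produced by value_counts().reset_index() is literally named "count".
import pandas as pd

s = pd.Series(["一级", "一级", "GB2312"], name="字表分级")
print(s.value_counts().reset_index())
# output shape (pandas 2.x):
#      字表分级  count
# 0      一级      2
# 1  GB2312      1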


def _to_int_array(x: str) -> list:
return [[int(u) for u in v.split(",") if u] for v in x.strip("*").split("/")]
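A quick illustration (not part of the diff) of what _to_int_array returns, assuming the 笔画拆解 strings separate units with "/", stroke numbers with ",", and may carry a trailing "*":

# Hypothetical input string, for illustration only.
def _to_int_array(x: str) -> list:
    return [[int(u) for u in v.split(",") if u] for v in x.strip("*").split("/")]

print(_to_int_array("1,2/3,4,5*"))  # -> [[1, 2], [3, 4, 5]]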


def tsv_to_json(data_dir: str, save_dir: str) -> None:
def _unit_type(x):
keys = ["键名字根", "笔画字根", "成字字根"]
vals = [k[:2] if x[k] == 1 else "" for k in keys]
vals = [v for v in vals if v]
if vals:
return "字根({})".format(",".join(vals))
return ""

save_dir = Path(save_dir)

def _full_code(x):
fullcode = x["全码"]
if x["识别码"]:
return "{};{}".format(fullcode[:-1], fullcode[-1])
else:
return fullcode
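Likewise, a small sketch of _full_code with made-up rows: when 识别码 is non-empty, the last letter of the full code is split off behind a semicolon; otherwise the code is returned unchanged.

# Made-up rows for illustration; real rows come from df.apply(_full_code, axis=1).
def _full_code(x):
    fullcode = x["全码"]
    if x["识别码"]:
        return "{};{}".format(fullcode[:-1], fullcode[-1])
    else:
        return fullcode

print(_full_code({"全码": "icy", "识别码": "y"}))   # -> "ic;y"
print(_full_code({"全码": "ggll", "识别码": ""}))   # -> "ggll"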


def tsv_to_json(data_dir: str, save_path: str) -> None:
save_dir = Path(save_path)
save_file = Path(save_dir, "data.json")
if not save_dir.exists():
logging.info(f"Create dir = {save_dir}")
@@ -59,31 +111,34 @@ def tsv_to_json(data_dir: str, save_dir: str) -> None:
return
logging.info(f"df = {df.shape}")

for col in ["全码", "简码", "容错码", "字根拆解", "笔画拆解", "识别码"]:
for col in ["全码", "简码", "容错码", "字根拆解", "笔画拆解", "识别码", '笔画数']:
df[col] = df[col].fillna("").astype(str)
for col in ["现代汉语语料库字频(%)", "刑红兵25亿字语料字频(百万)"]:
df[col] = df[col].fillna(0).astype(float)

df.index = df["汉字"]
df = df.rename(columns=RENAMED_COLS)
df["fullCode"] = df[["全码", "flag"]].apply(
lambda x: "{};{}".format(x["全码"][:-1], x["全码"][-1]) if x["flag"] else x["全码"]
, axis=1)
df["fullCode"] = df.apply(_full_code, axis=1)
df["units"] = df["字根拆解"].apply(lambda x: " ".join(x.split("※")))
df["segments"] = df["笔画拆解"].apply(_to_int_array)

keys = ["现代汉语语料库字频(%)", "刑红兵25亿字语料字频(百万)"]
df["freq"] = df.apply(lambda x: [x[k] for k in keys], axis=1)
# keys = ["五笔常用前1500", "一级简码", "二级简码", "键名字根", "笔画字根", "成字字根"]
# df["level"] = df.apply(lambda x: [x[k] for k in keys], axis=1)
# "五笔常用前1500", "一级简码", "二级简码",
df["unitType"] = df.apply(_unit_type, axis=1)

stats = get_stats(df)
renames = {k: v for k, v in RENAMED_COLS.items() if v not in df.columns}
df = df.rename(columns=renames)
df = df.set_index("汉字")

df2 = df[OUTPUT_COLS]
logging.info(f"output data = {df.shape}")
logging.info(f"output data = {df2.shape}")

out = df2.to_dict("index")
result = {"stats": stats, "chars": out}

logging.info(f"save to = {save_file}")
with open(save_file, "w") as f:
json.dump(out, f, indent=None, ensure_ascii=False)
json.dump(result, f, indent=None, ensure_ascii=False)


if __name__ == "__main__":
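As a rough sketch (field values are made up, not taken from the data), the data.json written by tsv_to_json now nests the per-character records under "chars" and the get_stats summary under "stats", which is what web/js/render.js below starts reading:

# Approximate layout of data.json after this commit; all values are illustrative.
example = {
    "stats": {
        "names": {"一级": "《通用规范汉字表》(2012年)一级汉字"},  # CHAR_NAMES, truncated here
        "stats": [{"字表": "《通用规范汉字表》(2012年)一级汉字", "count": 1234}],
        "total": 10000,      # df.shape[0]
        "fullCode": 10000,   # rows with a non-empty 全码
        "units": 10000,      # rows with a non-empty 字根拆解
        "segments": 10000,   # rows with a non-empty 笔画拆解
    },
    "chars": {
        # one entry per 汉字, holding the OUTPUT_COLS fields
        "汉": {"unicode": "6C49", "pinyin": "hàn", "fullCode": "ic;y"},  # remaining fields omitted
    },
}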
72 changes: 32 additions & 40 deletions web/css/styles.css
@@ -10,6 +10,7 @@
--area-color: #272b35;
--code-color: #f5f5f5;
}

@font-face {
font-family: "Wubi Units";
src: url("../font/黑体字根.ttf");
@@ -76,40 +77,6 @@ hr {
height: 5px;
}

button {
padding: 12px 24px;
border: none;
border-radius: 4px;
cursor: pointer;
font-size: 16px;
/* font-weight: 500; */
text-transform: uppercase;
transition: background-color 0.3s;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
margin: 0 5px;
}

input[type="text"] {
color: var(--text-color);
background-color: var(--background-color);
border: 1px solid var(--primary-color);
border-radius: 3px;
padding: 8px;
transition: border-color 0.3s;
box-sizing: border-box;
width: 100%;
max-width: 300px;
margin-right: 10px;
font-size: 16px;
/* font-family: "Roboto Mono", monospace; */
border: none;
outline: none;
padding: 10px;
border-radius: 25px;
flex: 1;
font-size: 16px;
}

.search-container {
display: flex;
align-items: center;
@@ -123,21 +90,36 @@
margin: 2rem auto;
}


input[type="text"] {
color: var(--text-color);
background-color: var(--background-color);
width: 100%;
flex: 1;
outline: none;
border: none;
border-radius: 15px;
padding: 10px 20px;
font-size: larger;
margin-right: 5px;
}

button {
background-color: var(--more-color);
color: var(--text-color);
border: none;
border-radius: 25px;
border-radius: 15px;
padding: 10px 20px;
cursor: pointer;
font-size: 16px;
font-size: larger;
margin-left: 10px;
}

button:hover {
background-color: var(--accent-color);
}


table {
border-collapse: collapse;
max-width: 95%;
@@ -167,7 +149,7 @@ input[type="text"]:hover {
background-color: var(--code-color);
}

td > span {
td>span {
font-size: xxx-large;
font-family: STKaiti, KaiTi, serif;
}
@@ -176,6 +158,10 @@ td div {
text-align: center;
}

td li div {
display: inline;
}

ul {
list-style-type: none;
padding: 0;
@@ -220,6 +206,10 @@ code span {
text-align: center;
}

#note-area .note {
font-size: smaller;
}

.tooltip {
position: relative;
display: inline-block;
@@ -239,9 +229,11 @@ code span {
padding: 5px 0;
position: absolute;
z-index: 1;
bottom: 125%; /* 在元素上方 */
bottom: 125%;
/* 在元素上方 */
left: 50%;
margin-left: -60px; /* 调整位置 */
margin-left: -60px;
/* 调整位置 */
opacity: 0;
transition: opacity 0.3s;
}
@@ -277,4 +269,4 @@ code span {
color: #f8f8f8;
border: 1px solid #666;
}
}
}
6 changes: 2 additions & 4 deletions web/index.html
@@ -20,12 +20,10 @@ <h1>五笔拆解查询</h1>
<main>
<section class="search-container">
<input type="text" id="query-text" placeholder="请输入汉字..." />
<button type="submit" id="query-button">查询</button>
<button type="submit" id="query-button">🔍️</button>
</section>
<section id="note-area">
<p>📝 注意这里五笔版本是<strong>1986</strong>版(王码)五笔。</p>
<p> <small>⚠️ 标识表示全码和容错码存在一定争议(比如起笔或末笔笔画顺序)。</small></p>
<p id="result"></p>
<p id="note-warning" class="note"></p>
</section>

<section>
71 changes: 47 additions & 24 deletions web/js/render.js
@@ -1,25 +1,48 @@
const dataFile = "data/data.json";
let data = {};
fetch(dataFile)
.then((response) => {
if (!response.ok) {
throw new Error("Network response was not ok");
}
return response.json();
})
.then((jsonData) => {
data = jsonData;
})
.catch((error) => {
console.error("Read JSON error:", error);
});
document.addEventListener("DOMContentLoaded", () => {
const dataFile = "data/data.json";

// event
document.getElementById("query-button").addEventListener("click", queryHanzi);
document
.getElementById("query-text")
.addEventListener("keydown", function (event) {
if (event.key === "Enter") {
queryHanzi();
}
});
// init data
fetch(dataFile)
.then((response) => {
if (!response.ok) {
throw new Error("Network response error");
}
return response.json();
})
.then((data) => {
const charData = data.chars;
const statsData = data.stats;

const paragraphs = [
"<p>📝 注意这里五笔版本是<strong>1986</strong>版(王码)五笔。</p>",
`<p class="note">当前收录汉字: ${statsData.total}字(五笔全码: ${statsData.fullCode}, 字根拆解:${statsData.units},图解:${statsData.segments})。</p>`,
'<p class="note">⚠️ 标识表示全码和容错码存在一定争议(比如起笔或末笔笔画顺序)。</p>',
];

const note = document.getElementById('note-area');
const para = document.getElementById('note-warning');
para.innerHTML = "";
paragraphs.forEach(text => {
const more = document.createElement('p');
more.innerHTML = text;
note.insertBefore(more, para);
});

// event
document
.getElementById("query-button")
.addEventListener("click", () => {
queryHanzi(charData, statsData);
});
document
.getElementById("query-text")
.addEventListener("keydown", function (event) {
if (event.key === "Enter") {
queryHanzi(charData, statsData);
}
});
})
.catch((error) => {
console.error("Read JSON error:", error);
});
});
