Commit
Update src
hantang committed Oct 15, 2024
1 parent 570716e commit 19596ab
Showing 5 changed files with 178 additions and 113 deletions.
83 changes: 69 additions & 14 deletions scripts/parse.py
@@ -7,20 +7,24 @@

RENAMED_COLS = {
"Unicode": "unicode",
# "全码": "fullCode",
"全码": "fullCode",
"简码": "shortCode",
"容错码": "faultCode",
"拼音": "pinyin",
# "字根拆解": "units",
"笔画数": "stroke",
"字根拆解": "units",
"字表来源": "source",
"笔画拆解": "segments",
"识别码": "flag",
}

OUTPUT_COLS = [
"unicode",
"pinyin",
"stroke",
"source",
"freq",
"unitType",
"fullCode",
"shortCode",
"faultCode",
@@ -29,6 +33,17 @@
"flag",
]

CHAR_NAMES = {
"一级": "《通用规范汉字表》(2012年)一级汉字",
"二级": "《通用规范汉字表》(2012年)二级汉字",
"三级": "《通用规范汉字表》(2012年)三级汉字",
"GB2312": "《信息交换用汉字编码字符集》(GB/T 2312-1980)",
"常用字": "《现代汉语常用字表》(1988年)常用字",
"次常用字": "《现代汉语常用字表》(1988年)次常用字",
"通用字": "《现代汉语通用字表》(1988年)通用字",
"其他": "其他常用汉字",
}


def read_source(data_dir: str):
files = Path(data_dir).glob("*.tsv")
@@ -41,13 +56,50 @@ def read_source(data_dir: str):
return df


def get_stats(df, col="字表来源"):
col_temp = "字表分级"
col_out = "字表"
sources = [
v.strip()
for line in df[col].fillna("").tolist()
for v in line.strip().split("/")
if v.strip()
]
df_stats = pd.Series(sources, name=col_temp).value_counts().reset_index()

names = CHAR_NAMES
df_stats[col_out] = df_stats[col_temp].apply(lambda x: names.get(x, x))
result = df_stats[[col_out, "count"]].to_dict("records")
out = {"names": names, "stats": result, "total": df.shape[0]}

for key in ["全码", "字根拆解", "笔画拆解"]:
out[RENAMED_COLS[key]] = df[df[key].fillna("") != ""].shape[0]
return out
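For context on the df_stats[[col_out, "count"]] selection above, a minimal sketch (an assumption about the environment, not part of the commit) of the frame shape get_stats relies on; it matches pandas 2.x, where value_counts().reset_index() names the counts column "count":

# Illustrative check only; assumes pandas >= 2.0, where the counts column
# produced by value_counts().reset_index() is literally named "count".
import pandas as pd

s = pd.Series(["一级", "一级", "GB2312"], name="字表分级")
print(s.value_counts().reset_index())
# output shape (pandas 2.x):
#      字表分级  count
# 0      一级      2
# 1  GB2312      1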


def _to_int_array(x: str) -> list:
return [[int(u) for u in v.split(",") if u] for v in x.strip("*").split("/")]
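A quick illustration (not part of the diff) of what _to_int_array returns, assuming the 笔画拆解 strings separate units with "/", stroke numbers with ",", and may carry a trailing "*":

# Hypothetical input string, for illustration only.
def _to_int_array(x: str) -> list:
    return [[int(u) for u in v.split(",") if u] for v in x.strip("*").split("/")]

print(_to_int_array("1,2/3,4,5*"))  # -> [[1, 2], [3, 4, 5]]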


def tsv_to_json(data_dir: str, save_dir: str) -> None:
def _unit_type(x):
keys = ["键名字根", "笔画字根", "成字字根"]
vals = [k[:2] if x[k] == 1 else "" for k in keys]
vals = [v for v in vals if v]
if vals:
return "字根({})".format(",".join(vals))
return ""

save_dir = Path(save_dir)

def _full_code(x):
fullcode = x["全码"]
if x["识别码"]:
return "{};{}".format(fullcode[:-1], fullcode[-1])
else:
return fullcode
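Likewise, a small sketch of _full_code with made-up rows: when 识别码 is non-empty, the last letter of the full code is split off behind a semicolon; otherwise the code is returned unchanged.

# Made-up rows for illustration; real rows come from df.apply(_full_code, axis=1).
def _full_code(x):
    fullcode = x["全码"]
    if x["识别码"]:
        return "{};{}".format(fullcode[:-1], fullcode[-1])
    else:
        return fullcode

print(_full_code({"全码": "icy", "识别码": "y"}))   # -> "ic;y"
print(_full_code({"全码": "ggll", "识别码": ""}))   # -> "ggll"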


def tsv_to_json(data_dir: str, save_path: str) -> None:
save_dir = Path(save_path)
save_file = Path(save_dir, "data.json")
if not save_dir.exists():
logging.info(f"Create dir = {save_dir}")
@@ -59,31 +111,34 @@ def tsv_to_json(data_dir: str, save_dir: str) -> None:
return
logging.info(f"df = {df.shape}")

for col in ["全码", "简码", "容错码", "字根拆解", "笔画拆解", "识别码"]:
for col in ["全码", "简码", "容错码", "字根拆解", "笔画拆解", "识别码", '笔画数']:
df[col] = df[col].fillna("").astype(str)
for col in ["现代汉语语料库字频(%)", "刑红兵25亿字语料字频(百万)"]:
df[col] = df[col].fillna(0).astype(float)

df.index = df["汉字"]
df = df.rename(columns=RENAMED_COLS)
df["fullCode"] = df[["全码", "flag"]].apply(
lambda x: "{};{}".format(x["全码"][:-1], x["全码"][-1]) if x["flag"] else x["全码"]
, axis=1)
df["fullCode"] = df.apply(_full_code, axis=1)
df["units"] = df["字根拆解"].apply(lambda x: " ".join(x.split("※")))
df["segments"] = df["笔画拆解"].apply(_to_int_array)

keys = ["现代汉语语料库字频(%)", "刑红兵25亿字语料字频(百万)"]
df["freq"] = df.apply(lambda x: [x[k] for k in keys], axis=1)
# keys = ["五笔常用前1500", "一级简码", "二级简码", "键名字根", "笔画字根", "成字字根"]
# df["level"] = df.apply(lambda x: [x[k] for k in keys], axis=1)
# "五笔常用前1500", "一级简码", "二级简码",
df["unitType"] = df.apply(_unit_type, axis=1)

stats = get_stats(df)
renames = {k: v for k, v in RENAMED_COLS.items() if v not in df.columns}
df = df.rename(columns=renames)
df = df.set_index("汉字")

df2 = df[OUTPUT_COLS]
logging.info(f"output data = {df.shape}")
logging.info(f"output data = {df2.shape}")

out = df2.to_dict("index")
result = {"stats": stats, "chars": out}

logging.info(f"save to = {save_file}")
with open(save_file, "w") as f:
json.dump(out, f, indent=None, ensure_ascii=False)
json.dump(result, f, indent=None, ensure_ascii=False)


if __name__ == "__main__":
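As a rough sketch (field values are made up, not taken from the data), the data.json written by tsv_to_json now nests the per-character records under "chars" and the get_stats summary under "stats", which is what web/js/render.js below starts reading:

# Approximate layout of data.json after this commit; all values are illustrative.
example = {
    "stats": {
        "names": {"一级": "《通用规范汉字表》(2012年)一级汉字"},  # CHAR_NAMES, truncated here
        "stats": [{"字表": "《通用规范汉字表》(2012年)一级汉字", "count": 1234}],
        "total": 10000,      # df.shape[0]
        "fullCode": 10000,   # rows with a non-empty 全码
        "units": 10000,      # rows with a non-empty 字根拆解
        "segments": 10000,   # rows with a non-empty 笔画拆解
    },
    "chars": {
        # one entry per 汉字, holding the OUTPUT_COLS fields
        "汉": {"unicode": "6C49", "pinyin": "hàn", "fullCode": "ic;y"},  # remaining fields omitted
    },
}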
72 changes: 32 additions & 40 deletions web/css/styles.css
@@ -10,6 +10,7 @@
--area-color: #272b35;
--code-color: #f5f5f5;
}

@font-face {
font-family: "Wubi Units";
src: url("../font/黑体字根.ttf");
@@ -76,40 +77,6 @@ hr {
height: 5px;
}

button {
padding: 12px 24px;
border: none;
border-radius: 4px;
cursor: pointer;
font-size: 16px;
/* font-weight: 500; */
text-transform: uppercase;
transition: background-color 0.3s;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
margin: 0 5px;
}

input[type="text"] {
color: var(--text-color);
background-color: var(--background-color);
border: 1px solid var(--primary-color);
border-radius: 3px;
padding: 8px;
transition: border-color 0.3s;
box-sizing: border-box;
width: 100%;
max-width: 300px;
margin-right: 10px;
font-size: 16px;
/* font-family: "Roboto Mono", monospace; */
border: none;
outline: none;
padding: 10px;
border-radius: 25px;
flex: 1;
font-size: 16px;
}

.search-container {
display: flex;
align-items: center;
@@ -123,21 +90,36 @@
margin: 2rem auto;
}


input[type="text"] {
color: var(--text-color);
background-color: var(--background-color);
width: 100%;
flex: 1;
outline: none;
border: none;
border-radius: 15px;
padding: 10px 20px;
font-size: larger;
margin-right: 5px;
}

button {
background-color: var(--more-color);
color: var(--text-color);
border: none;
border-radius: 25px;
border-radius: 15px;
padding: 10px 20px;
cursor: pointer;
font-size: 16px;
font-size: larger;
margin-left: 10px;
}

button:hover {
background-color: var(--accent-color);
}


table {
border-collapse: collapse;
max-width: 95%;
@@ -167,7 +149,7 @@ input[type="text"]:hover {
background-color: var(--code-color);
}

td > span {
td>span {
font-size: xxx-large;
font-family: STKaiti, KaiTi, serif;
}
@@ -176,6 +158,10 @@ td div {
text-align: center;
}

td li div {
display: inline;
}

ul {
list-style-type: none;
padding: 0;
@@ -220,6 +206,10 @@ code span {
text-align: center;
}

#note-area .note {
font-size: smaller;
}

.tooltip {
position: relative;
display: inline-block;
@@ -239,9 +229,11 @@ code span {
padding: 5px 0;
position: absolute;
z-index: 1;
bottom: 125%; /* 在元素上方 */
bottom: 125%;
/* 在元素上方 */
left: 50%;
margin-left: -60px; /* 调整位置 */
margin-left: -60px;
/* 调整位置 */
opacity: 0;
transition: opacity 0.3s;
}
@@ -277,4 +269,4 @@ code span {
color: #f8f8f8;
border: 1px solid #666;
}
}
}
6 changes: 2 additions & 4 deletions web/index.html
@@ -20,12 +20,10 @@ <h1>五笔拆解查询</h1>
<main>
<section class="search-container">
<input type="text" id="query-text" placeholder="请输入汉字..." />
<button type="submit" id="query-button">查询</button>
<button type="submit" id="query-button">🔍️</button>
</section>
<section id="note-area">
<p>📝 注意这里五笔版本是<strong>1986</strong>版(王码)五笔。</p>
<p> <small>⚠️ 标识表示全码和容错码存在一定争议(比如起笔或末笔笔画顺序)。</small></p>
<p id="result"></p>
<p id="note-warning" class="note"></p>
</section>

<section>
71 changes: 47 additions & 24 deletions web/js/render.js
@@ -1,25 +1,48 @@
const dataFile = "data/data.json";
let data = {};
fetch(dataFile)
.then((response) => {
if (!response.ok) {
throw new Error("Network response was not ok");
}
return response.json();
})
.then((jsonData) => {
data = jsonData;
})
.catch((error) => {
console.error("Read JSON error:", error);
});
document.addEventListener("DOMContentLoaded", () => {
const dataFile = "data/data.json";

// event
document.getElementById("query-button").addEventListener("click", queryHanzi);
document
.getElementById("query-text")
.addEventListener("keydown", function (event) {
if (event.key === "Enter") {
queryHanzi();
}
});
// init data
fetch(dataFile)
.then((response) => {
if (!response.ok) {
throw new Error("Network response error");
}
return response.json();
})
.then((data) => {
const charData = data.chars;
const statsData = data.stats;

const paragraphs = [
"<p>📝 注意这里五笔版本是<strong>1986</strong>版(王码)五笔。</p>",
`<p class="note">当前收录汉字: ${statsData.total}字(五笔全码: ${statsData.fullCode}, 字根拆解:${statsData.units},图解:${statsData.segments})。</p>`,
'<p class="note">⚠️ 标识表示全码和容错码存在一定争议(比如起笔或末笔笔画顺序)。</p>',
];

const note = document.getElementById('note-area');
const para = document.getElementById('note-warning');
para.innerHTML = "";
paragraphs.forEach(text => {
const more = document.createElement('p');
more.innerHTML = text;
note.insertBefore(more, para);
});

// event
document
.getElementById("query-button")
.addEventListener("click", () => {
queryHanzi(charData, statsData);
});
document
.getElementById("query-text")
.addEventListener("keydown", function (event) {
if (event.key === "Enter") {
queryHanzi(charData, statsData);
}
});
})
.catch((error) => {
console.error("Read JSON error:", error);
});
});
