diff --git a/application/lib/calibre/web/feeds/news.py b/application/lib/calibre/web/feeds/news.py
index 36749718..7f14e64c 100644
--- a/application/lib/calibre/web/feeds/news.py
+++ b/application/lib/calibre/web/feeds/news.py
@@ -1070,7 +1070,7 @@ def _postprocess_html(self, soup, first_fetch, job_info):
# for x in soup.find_all(attrs={attr: True}):
# del x[attr]
- for bad_tag in list(soup.find_all(['base', 'iframe', 'canvas', 'embed',
+ for bad_tag in list(soup.find_all(['base', 'iframe', 'canvas', 'embed', 'source',
'command', 'datalist', 'video', 'audio', 'noscript', 'link', 'meta', 'button'])):
bad_tag.extract()
diff --git a/application/lib/dictionary/mdict/mdict.py b/application/lib/dictionary/mdict/mdict.py
index 834d9d4d..1d8af080 100644
--- a/application/lib/dictionary/mdict/mdict.py
+++ b/application/lib/dictionary/mdict/mdict.py
@@ -4,8 +4,9 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
#stardict离线词典支持
-import os, re, zlib, json
+import os
from bs4 import BeautifulSoup
+from application.utils import xml_escape
from .readmdict import MDX
try:
import marisa_trie
@@ -79,6 +80,8 @@ def __init__(self, fname, encoding="", substyle=False, passcode=None):
return
#重建索引
+ #为什么不使用单独的后台任务自动重建索引?是因为运行时间还不是最重要的约束,而是服务器内存
+ #如果是大词典,内存可能要爆,怎么运行都不行,如果是小词典,则时间可以接受
default_log.info(f"Building trie for {dictName}")
#为了能制作大词典,mdx中这些数据都是64bit的,但是为了节省空间,这里只使用32bit保存(>LLLLLL)
self.trie = marisa_trie.RecordTrie(self.TRIE_FMT, self.mdx.get_index()) #type:ignore
@@ -95,6 +98,10 @@ def get(self, word):
if not self.trie:
return ''
word = word.lower().strip()
+ #和mdict官方应用一样,输入:about返回词典基本信息
+ if word == ':about':
+ return self.dictHtmlInfo()
+
indexes = self.trie[word] if word in self.trie else None
ret = self.get_content_by_Index(indexes)
if ret.startswith('@@@LINK='):
@@ -119,18 +126,37 @@ def post_process(self, content):
soup = BeautifulSoup(content, 'html.parser') #html.parser不会自动添加html/body
- #删除图像
- for tag in soup.find_all('img'):
+ #浏览器不支持 entry:// 协议,会直接拦截导致无法跳转,
+ #预先将其转换为 https://kindleear/entry/ 前缀,然后在js里面判断这个前缀
+ for tag in soup.find_all('a', href=True):
+ href = tag['href']
+ if href.startswith('entry://'):
+ tag['href'] = f'https://kindleear/entry/{href[8:]}'
+
+ #kindle对html支持很差,有一些词典又使用到这些标签
+ for tag in soup.find_all(['article', 'aside', 'header', 'footer', 'nav', 'main',
+ 'figcaption', 'figure', 'section', 'time']):
+ tag.name = 'div'
+
+ #删除多媒体资源和脚本
+ for tag in list(soup.find_all(['img', 'script', 'base', 'iframe', 'canvas', 'embed', 'source',
+ 'command', 'datalist', 'video', 'audio', 'noscript', 'meta', 'button'])):
tag.extract()
-
+
self.adjust_css(soup)
- #self.inline_css(soup) #碰到稍微复杂一些的CSS文件性能就比较低下,暂时屏蔽对CSS文件的支持
- self.remove_empty_tags(soup)
+ self.inline_css(soup)
+ #self.remove_empty_tags(soup)
- body = soup.body
- if body:
- body.name = 'div'
+ tag = soup.head
+ if tag:
+ tag.extract()
+ #mdict质量良莠不齐,有些词典在html/body外写释义
+ #所以不能直接提取body内容,直接修改为div简单粗暴也有效
+ for tag in (soup.html, soup.body):
+ if tag:
+ tag.name = 'div'
+
return str(soup)
#调整一些CSS
@@ -149,8 +175,9 @@ def adjust_css(self, soup):
#将外部单独css文件的样式内联到html标签中
def inline_css(self, soup):
link = soup.find('link', attrs={'rel': 'stylesheet', 'href': True})
- if not link:
- return
+ if link:
+ link.extract()
+ return #碰到稍微复杂一些的CSS文件性能就比较低下,暂时屏蔽对CSS文件的支持
link.extract()
css = ''
@@ -211,3 +238,15 @@ def remove_empty_tags(self, soup, preserve_tags=None):
self.remove_empty_tags(tag, preserve_tags)
for tag in empty_tags:
tag.decompose()
+
+ #Return basic information about the current dictionary (its MDX header fields) as one HTML string
+ def dictHtmlInfo(self):
+ ret = []
+ header = self.mdx.header.copy()  # work on a copy so the pop() calls below leave self.mdx.header untouched
+ ret.append('{}
'.format(header.pop('Title', '')))  # Title comes first; a missing key yields an empty string
+ ret.append('Description:
{}
'.format(header.pop('Description', '')))
+ stylesheet = xml_escape(header.pop('StyleSheet', '').replace('\n', '\\n'))  # newlines become a literal backslash-n, then the text is escaped
+ for k,v in header.items():  # every remaining header field; Title/Description/StyleSheet were popped above
+ ret.append('{}: {}
'.format(k, v))  # NOTE(review): only StyleSheet goes through xml_escape; confirm the other header values are safe to emit as raw HTML
+ ret.append('StyleSheet:{}
'.format(stylesheet))  # StyleSheet is emitted last
+ return ''.join(ret)
diff --git a/application/lib/dictionary/mdict/readmdict.py b/application/lib/dictionary/mdict/readmdict.py
index b5301c9d..a20edc13 100644
--- a/application/lib/dictionary/mdict/readmdict.py
+++ b/application/lib/dictionary/mdict/readmdict.py
@@ -823,6 +823,106 @@ def get_content_by_Index(self, indexes) -> str:
txt = b'
'.join(ret).decode(self.encoding)
return self._substitute_stylesheet(txt) if self.stylesheet else txt
+ def compare_keys(self, key1, key2):
+ """
+ Three-way comparison of two dictionary keys; both are normalized with process_str_keys() first.
+ KeyCaseSensitive in the header controls case handling: unless it is 'Yes', keys are lower-cased before comparing.
+ StripKey in the header only applies to mdx: 'No' keeps letters, spaces and symbols in the key; 'Yes' removes spaces and a fixed set of symbols.
+ Per the original author's note, MDX encodings are utf-8, utf-16, gb18030 (covering gbk, gb2312, gb18030), BIG5 and ISO8859-1.
+ MDD keys are utf-16le; plain utf-16 also defaults to utf-16le but prepends the \xff\xfe prefix.
+ Ordering: MDX 'UTF-16' keys compare by utf-16be bytes, MDX 'BIG-5' keys and all non-MDX keys by utf-8 bytes, other MDX encodings by their own bytes.
+ @param key1: the key user input
+ @param key2: the key from the file
+ @return: -1, 0 or 1 when key1 orders before, equal to, or after key2
+ """
+ # Keys stored in mdx/mdd are bytes while the lookup key is str, so any str->bytes conversion has to happen after lower().
+ # if type(key1) == str:
+ # key1 = key1.encode(self._encoding)
+ # if type(key2) == str:
+ # key2 = key2.encode(self._encoding)
+ # The last entry of "Dictionary of Engineering" is b'\xc5ngstr\xf6m compensation pyrheliometer'; \xc5 and \xf6 fail to decode, so decoding is lenient (original note says "replace", the code passes errors='ignore').
+ key1 = self.process_str_keys(key1)
+ key2 = self.process_str_keys(key2)
+
+ # if operator.__lt__(key1, key2):
+ # return -1
+ # elif operator.__eq__(key1, key2):
+ # return 0
+ # elif operator.__gt__(key1, key2):
+ # return 1
+ import operator  # function-local import
+ if self.__class__.__name__ == 'MDX':  # dispatch on the concrete class name
+ if self.encoding == 'UTF-16':  # UTF-16 dictionaries: compare the big-endian byte form
+ t_key1 = key1.encode('utf-16be', errors='ignore')
+ t_key2 = key2.encode('utf-16be', errors='ignore')
+ if operator.__lt__(t_key1, t_key2):
+ return -1
+ elif operator.__eq__(t_key1, t_key2):
+ return 0
+ elif operator.__gt__(t_key1, t_key2):
+ return 1
+ if self.encoding == 'BIG-5':  # NOTE(review): plain 'if', not 'elif'; harmless only because the ladder above returns for every bytes pair. Also confirm 'BIG-5' is the spelling self.encoding holds
+ t_key1 = key1.encode('utf-8', errors='ignore')
+ t_key2 = key2.encode('utf-8', errors='ignore')
+ if operator.__lt__(t_key1, t_key2):
+ return -1
+ elif operator.__eq__(t_key1, t_key2):
+ return 0
+ elif operator.__gt__(t_key1, t_key2):
+ return 1
+ else:  # presumably pairs with the 'BIG-5' test (indentation is not visible in this view): any other MDX encoding
+ t_key1 = key1.encode(self.encoding, errors='ignore')
+ t_key2 = key2.encode(self.encoding, errors='ignore')
+ if operator.__lt__(t_key1, t_key2):
+ return -1
+ elif operator.__eq__(t_key1, t_key2):
+ return 0
+ elif operator.__gt__(t_key1, t_key2):
+ return 1
+ else:  # presumably pairs with the class-name test: instances that are not MDX compare by utf-8 bytes
+ t_key1 = key1.encode('utf-8', errors='ignore')
+ t_key2 = key2.encode('utf-8', errors='ignore')
+ if operator.__lt__(t_key1, t_key2):
+ return -1
+ elif operator.__eq__(t_key1, t_key2):
+ return 0
+ elif operator.__gt__(t_key1, t_key2):
+ return 1
+
+ def lower_str_keys(self, key):
+ """Return key unchanged when header['KeyCaseSensitive'] is 'Yes'; otherwise return key.lower()."""
+ return key if self.header.get('KeyCaseSensitive') == 'Yes' else key.lower()
+
+ def strip_key(self):
+ # Store the header's StripKey setting in self._strip_key: 0 = False ('No'), 1 = True ('Yes'), 2 = None (key missing or any other value)
+ if 'StripKey' in self.header.keys():
+ if self.header['StripKey'] == 'Yes':
+ self._strip_key = 1
+ elif self.header['StripKey'] == 'No':
+ self._strip_key = 0
+ else:
+ self._strip_key = 2
+ else:
+ self._strip_key = 2
+
+ if self.__class__.__name__ == 'MDD':  # presumably at function level (indentation not visible here), so MDD instances always end with 0
+ self._strip_key = 0
+
+ def process_str_keys(self, key):  # normalize a key for comparison: decode bytes to str, optionally drop punctuation, then apply lower_str_keys()
+ if self.__class__.__name__ == 'MDX':
+ if isinstance(key, bytes):
+ if self.encoding == 'UTF-16':
+ key = key.decode('utf-16le', errors='ignore')  # UTF-16 keys are decoded as little-endian
+ else:
+ # With ISO8859-1, Chinese text raised a latin-1 UnicodeDecodeError, so undecodable bytes are ignored
+ key = key.decode(self.encoding, errors='ignore')
+ else:
+ if isinstance(key, bytes):
+ key = key.decode(self.encoding)  # strict decode on this path (no errors= argument)
+ if self._strip_key == 1:  # NOTE(review): self._strip_key is assigned in strip_key(); confirm that runs before this method is called
+ key = re.sub(r'[ _=,.;:!?@%~`()\[\]<>{}/\\\$\+\-\*\^\'"\t|]', '', key)  # remove spaces, tabs and the listed punctuation characters
+ return self.lower_str_keys(key) # do not strip() here
+
if __name__ == "__main__":
import sys
import os
diff --git a/application/static/reader.js b/application/static/reader.js
index 1b987e9c..cdd910de 100644
--- a/application/static/reader.js
+++ b/application/static/reader.js
@@ -4,7 +4,7 @@
var g_iframeScrollHeight = 500; //在 iframeLoadEvent 里更新
//var g_iframeClientHeight = 500;
-var g_currentArticle = {};
+var g_currentArticle = {}; //{title:,src:,}
var g_dictMode = false;
const g_trTextContainerHeight = 350; //350px在reader.css定义tr-text-container和tr-result-text
@@ -519,6 +519,25 @@ function scrollToNode(container, node) {
container.scrollTop = pos;
}
+//Highlight the entry being read: bold the .nav-title whose data-src equals g_currentArticle.src, reset all others to normal
+function highlightCurrentArticle() {
+ var art = g_currentArticle;
+ if (isEmpty(art)) { //nothing to highlight when no article is recorded
+ return;
+ }
+
+ var navContent = document.getElementById('nav-content');
+ var items = navContent.querySelectorAll('.nav-title'); //NOTE(review): throws if #nav-content is absent; confirm the element always exists
+ for (var i = 0; i < items.length; i++) {
+ var item = items[i];
+ if (item.getAttribute('data-src') == art.src) { //loose equality against the data-src attribute string
+ item.style.fontWeight = 'bold';
+ } else {
+ item.style.fontWeight = 'normal';
+ }
+ }
+}
+
//删除一本或多本书
function navDeleteBooks(event) {
hidePopMenu();
@@ -660,7 +679,22 @@ function toggleDictMode() {
}
//关闭查词窗口
-function closeDictDialog() {
+function closeDictDialog(event) {
+ //处理词典内词条跳转
+ var target = event ? event.target || event.srcElement : null;
+ if (target && (target.tagName == 'A')) {
+ event.stopPropagation();
+ event.preventDefault();
+ var href = target.getAttribute('href') || '';
+ if (href.startsWith('https://kindleear/entry/')) {
+ var word = href.substring(24);
+ if (word) {
+ translateWord(word);
+ return;
+ }
+ }
+ }
+
g_dictMode = false;
document.getElementById('tr-result').style.display = 'none';
document.getElementById('corner-dict-hint').style.display = 'none';
@@ -864,6 +898,7 @@ function openArticle(article) {
}
hideNavbar();
closeDictDialog();
+ highlightCurrentArticle();
}
//打开上一篇文章
@@ -930,19 +965,22 @@ function iframeLoadEvent(evt) {
adjustIFrameStyle(iframe);
var doc = iframe.contentDocument || iframe.contentWindow.document;
doc.addEventListener('click', function(event) {
+ //处理链接的点击事件
var target = event.target || event.srcElement;
if (target && (target.tagName == 'A')) {
event.stopPropagation();
event.preventDefault();
var href = target.getAttribute('href');
if (href && g_allowLinks) {
- //window.open(href, '_blank');
window.location.href = href; //kindle不支持window.open()
return;
}
}
+
+ //判断是否查词典
var selection = doc.getSelection();
var text = selection.toString();
+ var dictDialog = document.getElementById('tr-result');
if (g_dictMode) {
text = text || getWordAtClick(event, iframe);
if (text) {
@@ -950,28 +988,33 @@ function iframeLoadEvent(evt) {
}
g_dictMode = false;
document.getElementById('corner-dict-hint').style.display = 'none';
+ } else if (dictDialog && dictDialog.style.display != 'none') { //关闭查词窗口
+ closeDictDialog();
} else if (!text) { //没有选择文本才翻页
clickEvent(event);
}
});
+
+ //只有PC有键盘快捷键
doc.addEventListener('keydown', documentKeyDownEvent);
}
//每次iframe加载完成后调整其样式和容器高度
function adjustIFrameStyle(iframe) {
iframe = iframe || document.getElementById('iframe');
- var doc = iframe.contentDocument || iframe.contentWindow.document;
+ var doc = iframe.contentWindow.document || iframe.contentDocument;
var body = doc.body;
+ iframe.style.display = "block";
iframe.style.height = 'auto';
body.style.textAlign = 'justify';
body.style.wordWrap = 'break-word';
body.style.hyphens = 'auto';
- body.style.marginRight = '10px';
+ body.style.margin = '10px 20px 10px 20px';
+ body.style.paddingBottom = '20px';
body.style.fontSize = g_fontSize.toFixed(1) + 'em';
body.style.cursor = 'pointer';
body.style.webkitTapHighlightColor = 'transparent';
body.style.webkitTouchCallout = 'none';
- iframe.style.display = "block";
var images = doc.querySelectorAll('img');
for (var i = 0; i < images.length; i++) {
@@ -980,9 +1023,11 @@ function adjustIFrameStyle(iframe) {
}
var vh = getViewportHeight();
- g_iframeScrollHeight = Math.max(doc.documentElement.scrollHeight || body.scrollHeight, vh);
- //g_iframeClientHeight = Math.max(doc.documentElement.clientHeight || body.clientHeight, vh);
- iframe.style.height = g_iframeScrollHeight + 'px';
+ var html = doc.documentElement;
+ var height = Math.max(body.scrollHeight, body.clientHeight, body.offsetHeight,
+ html.scrollHeight, html.clientHeight, html.offsetHeight, vh) + 40;
+ iframe.style.height = height + 'px';
+ g_iframeScrollHeight = height;
}
//使用键盘快捷键翻页
diff --git a/application/templates/reader.html b/application/templates/reader.html
index 2d7cf9c5..b1d20e9d 100644
--- a/application/templates/reader.html
+++ b/application/templates/reader.html
@@ -19,7 +19,7 @@
-
+
-
X
+
X
@@ -171,10 +171,10 @@
{% autoescape off -%}
{% endblock -%}
diff --git a/application/view/reader.py b/application/view/reader.py
index 52f4dedf..f8f0f362 100644
--- a/application/view/reader.py
+++ b/application/view/reader.py
@@ -77,8 +77,6 @@ def ReaderRoute():
oebBooks = json.dumps(oebBooks, ensure_ascii=False)
initArticle = url_for('bpReader.ReaderArticleNoFoundRoute', tips='')
params = user.cfg('reader_params')
- if not params.get('allowLinks'):
- params['allowLinks'] = 0
shareKey = user.share_links.get('key')
if (get_locale() or '').startswith('zh'):
helpPage = 'https://cdhigh.github.io/KindleEar/Chinese/reader.html'
@@ -182,7 +180,7 @@ def ReaderDictRoute(user: KeUser, userDir: str):
dic.refresh()
engines = {name: {'databases': klass.databases} for name,klass in all_dict_engines.items()}
- return render_template('dict.html', user=user, engines=engines, tips='', langMap=LangMap())
+ return render_template('word_lookup.html', user=user, engines=engines, tips='', langMap=LangMap())
#Api查词
@bpReader.post("/reader/dict", endpoint='ReaderDictPost')
@@ -241,7 +239,7 @@ def ReaderDictPost(user: KeUser, userDir: str):
#import traceback
#traceback.print_exc()
definition = f'Error:
{e}'
- #print(json.dumps(definition)) #TODO
+ print(json.dumps(definition)) #TODO
return {'status': 'ok', 'word': word, 'definition': definition,
'dictname': str(inst), 'others': others}