diff --git a/application/lib/calibre/web/feeds/news.py b/application/lib/calibre/web/feeds/news.py index 36749718..7f14e64c 100644 --- a/application/lib/calibre/web/feeds/news.py +++ b/application/lib/calibre/web/feeds/news.py @@ -1070,7 +1070,7 @@ def _postprocess_html(self, soup, first_fetch, job_info): # for x in soup.find_all(attrs={attr: True}): # del x[attr] - for bad_tag in list(soup.find_all(['base', 'iframe', 'canvas', 'embed', + for bad_tag in list(soup.find_all(['base', 'iframe', 'canvas', 'embed', 'source', 'command', 'datalist', 'video', 'audio', 'noscript', 'link', 'meta', 'button'])): bad_tag.extract() diff --git a/application/lib/dictionary/mdict/mdict.py b/application/lib/dictionary/mdict/mdict.py index 834d9d4d..1d8af080 100644 --- a/application/lib/dictionary/mdict/mdict.py +++ b/application/lib/dictionary/mdict/mdict.py @@ -4,8 +4,9 @@ #!/usr/bin/env python3 # -*- coding:utf-8 -*- #stardict离线词典支持 -import os, re, zlib, json +import os from bs4 import BeautifulSoup +from application.utils import xml_escape from .readmdict import MDX try: import marisa_trie @@ -79,6 +80,8 @@ def __init__(self, fname, encoding="", substyle=False, passcode=None): return #重建索引 + #为什么不使用单独的后台任务自动重建索引?是因为运行时间还不是最重要的约束,而是服务器内存 + #如果是大词典,内存可能要爆,怎么运行都不行,如果是小词典,则时间可以接受 default_log.info(f"Building trie for {dictName}") #为了能制作大词典,mdx中这些数据都是64bit的,但是为了节省空间,这里只使用32bit保存(>LLLLLL) self.trie = marisa_trie.RecordTrie(self.TRIE_FMT, self.mdx.get_index()) #type:ignore @@ -95,6 +98,10 @@ def get(self, word): if not self.trie: return '' word = word.lower().strip() + #和mdict官方应用一样,输入:about返回词典基本信息 + if word == ':about': + return self.dictHtmlInfo() + indexes = self.trie[word] if word in self.trie else None ret = self.get_content_by_Index(indexes) if ret.startswith('@@@LINK='): @@ -119,18 +126,37 @@ def post_process(self, content): soup = BeautifulSoup(content, 'html.parser') #html.parser不会自动添加html/body - #删除图像 - for tag in soup.find_all('img'): + #浏览器不支持 entry:// 协议,会直接拦截导致无法跳转, + #预先将其转换为 https://kindleear/entry/ 前缀,然后在js里面判断这个前缀 + for tag in soup.find_all('a', href=True): + href = tag['href'] + if href.startswith('entry://'): + tag['href'] = f'https://kindleear/entry/{href[8:]}' + + #kindle对html支持很差,有一些词典又使用到这些标签 + for tag in soup.find_all(['article', 'aside', 'header', 'footer', 'nav', 'main', + 'figcaption', 'figure', 'section', 'time']): + tag.name = 'div' + + #删除多媒体资源和脚本 + for tag in list(soup.find_all(['img', 'script', 'base', 'iframe', 'canvas', 'embed', 'source', + 'command', 'datalist', 'video', 'audio', 'noscript', 'meta', 'button'])): tag.extract() - + self.adjust_css(soup) - #self.inline_css(soup) #碰到稍微复杂一些的CSS文件性能就比较低下,暂时屏蔽对CSS文件的支持 - self.remove_empty_tags(soup) + self.inline_css(soup) + #self.remove_empty_tags(soup) - body = soup.body - if body: - body.name = 'div' + tag = soup.head + if tag: + tag.extract() + #mdict质量良莠不齐,有些词典在html/body外写释义 + #所以不能直接提取body内容,直接修改为div简单粗暴也有效 + for tag in (soup.html, soup.body): + if tag: + tag.name = 'div' + return str(soup) #调整一些CSS @@ -149,8 +175,9 @@ def adjust_css(self, soup): #将外部单独css文件的样式内联到html标签中 def inline_css(self, soup): link = soup.find('link', attrs={'rel': 'stylesheet', 'href': True}) - if not link: - return + if link: + link.extract() + return #碰到稍微复杂一些的CSS文件性能就比较低下,暂时屏蔽对CSS文件的支持 link.extract() css = '' @@ -211,3 +238,15 @@ def remove_empty_tags(self, soup, preserve_tags=None): self.remove_empty_tags(tag, preserve_tags) for tag in empty_tags: tag.decompose() + + #返回当前词典的基本信息,html格式 + def dictHtmlInfo(self): + ret = [] + header = self.mdx.header.copy() + ret.append('{}
'.format(header.pop('Title', ''))) + ret.append('Description:
{}

'.format(header.pop('Description', ''))) + stylesheet = xml_escape(header.pop('StyleSheet', '').replace('\n', '\\n')) + for k,v in header.items(): + ret.append('{}:  {}
'.format(k, v)) + ret.append('StyleSheet:{}
'.format(stylesheet)) + return ''.join(ret) diff --git a/application/lib/dictionary/mdict/readmdict.py b/application/lib/dictionary/mdict/readmdict.py index b5301c9d..a20edc13 100644 --- a/application/lib/dictionary/mdict/readmdict.py +++ b/application/lib/dictionary/mdict/readmdict.py @@ -823,6 +823,106 @@ def get_content_by_Index(self, indexes) -> str: txt = b'
'.join(ret).decode(self.encoding) return self._substitute_stylesheet(txt) if self.stylesheet else txt + def compare_keys(self, key1, key2): + """ + 排序要求: + header中KeyCaseSensitive表明排序时是否大小写不敏感,为No时要转化为小写字母比较。 + header中StripKey只对mdx有效,为No,则不分词,字母、空格、符号都参与排序,为Yes,则分词,仅字母参与排序,去掉空格、符号。 + MDX的编码有utf-8,utf-16,gb18030(包括gbk,gb2313,gb18030),BIG5,ISO8859-1。 + MDD的编码为utf-16le,尽管utf-16默认也是utf-16le,但是会加前缀\xff\xfe。 + 排序:utf-16按照utf-16le编解码,按照utf-16be排序,其他按照各自编码排序。 + @param key1: the key user input + @param key2: the key from the file + @return: + """ + # mdx和mdd中的key都是bytes,查询key是str,因此str转bytes要在lower()之后进行。 + # if type(key1) == str: + # key1 = key1.encode(self._encoding) + # if type(key2) == str: + # key2 = key2.encode(self._encoding) + # Dictionary of Engineering的最后一个词条是b'\xc5ngstr\xf6m compensation pyrheliometer',其中\xc5和\xf6解码报错,因此用replace。 + key1 = self.process_str_keys(key1) + key2 = self.process_str_keys(key2) + + # if operator.__lt__(key1, key2): + # return -1 + # elif operator.__eq__(key1, key2): + # return 0 + # elif operator.__gt__(key1, key2): + # return 1 + import operator + if self.__class__.__name__ == 'MDX': + if self.encoding == 'UTF-16': + t_key1 = key1.encode('utf-16be', errors='ignore') + t_key2 = key2.encode('utf-16be', errors='ignore') + if operator.__lt__(t_key1, t_key2): + return -1 + elif operator.__eq__(t_key1, t_key2): + return 0 + elif operator.__gt__(t_key1, t_key2): + return 1 + if self.encoding == 'BIG-5': + t_key1 = key1.encode('utf-8', errors='ignore') + t_key2 = key2.encode('utf-8', errors='ignore') + if operator.__lt__(t_key1, t_key2): + return -1 + elif operator.__eq__(t_key1, t_key2): + return 0 + elif operator.__gt__(t_key1, t_key2): + return 1 + else: + t_key1 = key1.encode(self.encoding, errors='ignore') + t_key2 = key2.encode(self.encoding, errors='ignore') + if operator.__lt__(t_key1, t_key2): + return -1 + elif operator.__eq__(t_key1, t_key2): + return 0 + elif operator.__gt__(t_key1, t_key2): + return 1 + else: + t_key1 = key1.encode('utf-8', errors='ignore') + t_key2 = key2.encode('utf-8', errors='ignore') + if operator.__lt__(t_key1, t_key2): + return -1 + elif operator.__eq__(t_key1, t_key2): + return 0 + elif operator.__gt__(t_key1, t_key2): + return 1 + + def lower_str_keys(self, key): + """自动转换为小写""" + return key if self.header.get('KeyCaseSensitive') == 'Yes' else key.lower() + + def strip_key(self): + # 0:False,1:True,2:None + if 'StripKey' in self.header.keys(): + if self.header['StripKey'] == 'Yes': + self._strip_key = 1 + elif self.header['StripKey'] == 'No': + self._strip_key = 0 + else: + self._strip_key = 2 + else: + self._strip_key = 2 + + if self.__class__.__name__ == 'MDD': + self._strip_key = 0 + + def process_str_keys(self, key): + if self.__class__.__name__ == 'MDX': + if isinstance(key, bytes): + if self.encoding == 'UTF-16': + key = key.decode('utf-16le', errors='ignore') + else: + # ISO8859-1编码中文报错latin-1 UnicodeDecodeError + key = key.decode(self.encoding, errors='ignore') + else: + if isinstance(key, bytes): + key = key.decode(self.encoding) + if self._strip_key == 1: + key = re.sub(r'[ _=,.;:!?@%&#~`()\[\]<>{}/\\\$\+\-\*\^\'"\t|]', '', key) + return self.lower_str_keys(key) # 这里不能strip() + if __name__ == "__main__": import sys import os diff --git a/application/static/reader.js b/application/static/reader.js index 1b987e9c..cdd910de 100644 --- a/application/static/reader.js +++ b/application/static/reader.js @@ -4,7 +4,7 @@ var g_iframeScrollHeight = 500; //在 iframeLoadEvent 里更新 //var g_iframeClientHeight = 500; -var g_currentArticle = {}; +var g_currentArticle = {}; //{title:,src:,} var g_dictMode = false; const g_trTextContainerHeight = 350; //350px在reader.css定义tr-text-container和tr-result-text @@ -519,6 +519,25 @@ function scrollToNode(container, node) { container.scrollTop = pos; } +//高亮显示当前正在读的书 +function highlightCurrentArticle() { + var art = g_currentArticle; + if (isEmpty(art)) { + return; + } + + var navContent = document.getElementById('nav-content'); + var items = navContent.querySelectorAll('.nav-title'); + for (var i = 0; i < items.length; i++) { + var item = items[i]; + if (item.getAttribute('data-src') == art.src) { + item.style.fontWeight = 'bold'; + } else { + item.style.fontWeight = 'normal'; + } + } +} + //删除一本或多本书 function navDeleteBooks(event) { hidePopMenu(); @@ -660,7 +679,22 @@ function toggleDictMode() { } //关闭查词窗口 -function closeDictDialog() { +function closeDictDialog(event) { + //处理词典内词条跳转 + var target = event ? event.target || event.srcElement : null; + if (target && (target.tagName == 'A')) { + event.stopPropagation(); + event.preventDefault(); + var href = target.getAttribute('href') || ''; + if (href.startsWith('https://kindleear/entry/')) { + var word = href.substring(24); + if (word) { + translateWord(word); + return; + } + } + } + g_dictMode = false; document.getElementById('tr-result').style.display = 'none'; document.getElementById('corner-dict-hint').style.display = 'none'; @@ -864,6 +898,7 @@ function openArticle(article) { } hideNavbar(); closeDictDialog(); + highlightCurrentArticle(); } //打开上一篇文章 @@ -930,19 +965,22 @@ function iframeLoadEvent(evt) { adjustIFrameStyle(iframe); var doc = iframe.contentDocument || iframe.contentWindow.document; doc.addEventListener('click', function(event) { + //处理链接的点击事件 var target = event.target || event.srcElement; if (target && (target.tagName == 'A')) { event.stopPropagation(); event.preventDefault(); var href = target.getAttribute('href'); if (href && g_allowLinks) { - //window.open(href, '_blank'); window.location.href = href; //kindle不支持window.open() return; } } + + //判断是否查词典 var selection = doc.getSelection(); var text = selection.toString(); + var dictDialog = document.getElementById('tr-result'); if (g_dictMode) { text = text || getWordAtClick(event, iframe); if (text) { @@ -950,28 +988,33 @@ function iframeLoadEvent(evt) { } g_dictMode = false; document.getElementById('corner-dict-hint').style.display = 'none'; + } else if (dictDialog && dictDialog.style.display != 'none') { //关闭查词窗口 + closeDictDialog(); } else if (!text) { //没有选择文本才翻页 clickEvent(event); } }); + + //只有PC有键盘快捷键 doc.addEventListener('keydown', documentKeyDownEvent); } //每次iframe加载完成后调整其样式和容器高度 function adjustIFrameStyle(iframe) { iframe = iframe || document.getElementById('iframe'); - var doc = iframe.contentDocument || iframe.contentWindow.document; + var doc = iframe.contentWindow.document || iframe.contentDocument; var body = doc.body; + iframe.style.display = "block"; iframe.style.height = 'auto'; body.style.textAlign = 'justify'; body.style.wordWrap = 'break-word'; body.style.hyphens = 'auto'; - body.style.marginRight = '10px'; + body.style.margin = '10px 20px 10px 20px'; + body.style.paddingBottom = '20px'; body.style.fontSize = g_fontSize.toFixed(1) + 'em'; body.style.cursor = 'pointer'; body.style.webkitTapHighlightColor = 'transparent'; body.style.webkitTouchCallout = 'none'; - iframe.style.display = "block"; var images = doc.querySelectorAll('img'); for (var i = 0; i < images.length; i++) { @@ -980,9 +1023,11 @@ function adjustIFrameStyle(iframe) { } var vh = getViewportHeight(); - g_iframeScrollHeight = Math.max(doc.documentElement.scrollHeight || body.scrollHeight, vh); - //g_iframeClientHeight = Math.max(doc.documentElement.clientHeight || body.clientHeight, vh); - iframe.style.height = g_iframeScrollHeight + 'px'; + var html = doc.documentElement; + var height = Math.max(body.scrollHeight, body.clientHeight, body.offsetHeight, + html.scrollHeight, html.clientHeight, html.offsetHeight, vh) + 40; + iframe.style.height = height + 'px'; + g_iframeScrollHeight = height; } //使用键盘快捷键翻页 diff --git a/application/templates/reader.html b/application/templates/reader.html index 2d7cf9c5..b1d20e9d 100644 --- a/application/templates/reader.html +++ b/application/templates/reader.html @@ -19,7 +19,7 @@ -
+
@@ -34,7 +34,7 @@
-
X
+
X
@@ -171,10 +171,10 @@ {% autoescape off -%} {% endblock -%} diff --git a/application/view/reader.py b/application/view/reader.py index 52f4dedf..f8f0f362 100644 --- a/application/view/reader.py +++ b/application/view/reader.py @@ -77,8 +77,6 @@ def ReaderRoute(): oebBooks = json.dumps(oebBooks, ensure_ascii=False) initArticle = url_for('bpReader.ReaderArticleNoFoundRoute', tips='') params = user.cfg('reader_params') - if not params.get('allowLinks'): - params['allowLinks'] = 0 shareKey = user.share_links.get('key') if (get_locale() or '').startswith('zh'): helpPage = 'https://cdhigh.github.io/KindleEar/Chinese/reader.html' @@ -182,7 +180,7 @@ def ReaderDictRoute(user: KeUser, userDir: str): dic.refresh() engines = {name: {'databases': klass.databases} for name,klass in all_dict_engines.items()} - return render_template('dict.html', user=user, engines=engines, tips='', langMap=LangMap()) + return render_template('word_lookup.html', user=user, engines=engines, tips='', langMap=LangMap()) #Api查词 @bpReader.post("/reader/dict", endpoint='ReaderDictPost') @@ -241,7 +239,7 @@ def ReaderDictPost(user: KeUser, userDir: str): #import traceback #traceback.print_exc() definition = f'Error:
{e}' - #print(json.dumps(definition)) #TODO + print(json.dumps(definition)) #TODO return {'status': 'ok', 'word': word, 'definition': definition, 'dictname': str(inst), 'others': others}