Skip to content

Commit

Permalink
Better HTML entity handling, closes #9
Browse files Browse the repository at this point in the history
  • Loading branch information
simonw committed Feb 11, 2021
1 parent 44e58ba commit 95a4905
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 9 deletions.
17 changes: 14 additions & 3 deletions evernote_to_sqlite/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import base64
import datetime
import hashlib
import html.entities
import re


def find_all_tags(fp, tags, progress_callback=None):
Expand All @@ -26,9 +28,8 @@ def save_note(db, note):
title = note.find("title").text
created = note.find("created").text
updated = note.find("updated").text
# Some content has &nbsp; which breaks the XML parser
content_xml = note.find("content").text.replace(" ", "")
content = ET.tostring(ET.fromstring(content_xml.strip())).decode("utf-8")
content_xml = resolve_entities(note.find("content").text.strip())
content = ET.tostring(ET.fromstring(content_xml)).decode("utf-8")
row = {
"title": title,
"content": content,
Expand Down Expand Up @@ -98,3 +99,13 @@ def ensure_indexes(db):

def convert_datetime(s):
    """Convert a ``YYYYMMDDTHHMMSSZ`` timestamp string to ISO 8601 text.

    The input is parsed with the fixed format ``%Y%m%dT%H%M%SZ`` and the
    resulting datetime is rendered with ``isoformat()``, e.g.
    ``"20201011T212822Z"`` becomes ``"2020-10-11T21:28:22"``.

    Raises:
        ValueError: if ``s`` does not match the expected format.
    """
    timestamp_format = "%Y%m%dT%H%M%SZ"
    parsed = datetime.datetime.strptime(s, timestamp_format)
    return parsed.isoformat()


# Matches named entity references such as "&nbsp;". Numeric character
# references ("&#353;") contain "#", so they never match and are passed
# through unchanged.
_entities_re = re.compile(r"&(\w+);")

# The five entities predefined by XML itself. They must stay escaped:
# decoding "&lt;" or "&amp;" into a raw "<" or "&" would turn escaped text
# into markup and break (or silently corrupt) the subsequent XML parse.
_xml_predefined_entities = frozenset(("amp", "lt", "gt", "quot", "apos"))


def resolve_entities(s):
    """Replace named HTML entities in ``s`` with their unicode characters.

    Entities such as ``&nbsp;`` or ``&scaron;`` are looked up in
    ``html.entities.entitydefs`` and replaced by the mapped text, so the
    result no longer contains those named references.

    The XML-predefined entities (``&amp;``, ``&lt;``, ``&gt;``, ``&quot;``,
    ``&apos;``) are left exactly as written, because resolving them would
    inject raw markup characters into the string.

    A name that is not found in ``html.entities.entitydefs`` is replaced by
    the bare name (``&foo;`` becomes ``foo``).

    Args:
        s: The text to process.

    Returns:
        ``s`` with resolvable named entities substituted.
    """

    def _replace(match):
        name = match.group(1)
        if name in _xml_predefined_entities:
            # Keep the original "&name;" text untouched.
            return match.group(0)
        return html.entities.entitydefs.get(name, name)

    return _entities_re.sub(_replace, s)
2 changes: 1 addition & 1 deletion tests/example-note.enex
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE en-export SYSTEM "http://xml.evernote.com/pub/evernote-export3.dtd">
<en-export export-date="20201011T235248Z" application="Evernote" version="Evernote Mac 7.14 (458265)">
<note><title>Example note with images</title><content><![CDATA[<!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd"><en-note><div>This note includes two images.</div><div><br /></div><div><span style="font-weight: bold;">The Python logo</span></div><div><br /></div><div><en-media hash="61098c2c541de7f0a907c301dd6542da" type="image/svg+xml" width="125" /><br /></div><div><br /></div><div><span style="font-weight: bold;">The Evernote logo</span></div><div><br /></div><div><en-media hash="91bd26175acac0b2ffdb6efac199f8ca" type="image/svg+xml" width="125" /><br /></div><div><br /></div><div>This image contains text:</div><div><br /></div><div><en-media hash="76dd28b07797cc9f3f129c4871c5293c" type="image/png" /></div><div><br /></div></en-note>]]></content><created>20201011T212822Z</created><updated>20201011T233038Z</updated><note-attributes><latitude>37.77742571705006</latitude><longitude>-122.4256495114116</longitude><altitude>23.16121864318848</altitude><author>Simon Willison</author><source>desktop.mac</source><reminder-order>0</reminder-order></note-attributes><resource><data encoding="base64">PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHhtbG5zOnhsaW5rPSJodHRwOi8v
<note><title>Example note with images</title><content><![CDATA[<!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd"><en-note><div>This note includes two images. &scaron;.</div><div><br /></div><div><span style="font-weight: bold;">The Python logo</span></div><div><br /></div><div><en-media hash="61098c2c541de7f0a907c301dd6542da" type="image/svg+xml" width="125" /><br /></div><div><br /></div><div><span style="font-weight: bold;">The Evernote logo</span></div><div><br /></div><div><en-media hash="91bd26175acac0b2ffdb6efac199f8ca" type="image/svg+xml" width="125" /><br /></div><div><br /></div><div>This image contains text:</div><div><br /></div><div><en-media hash="76dd28b07797cc9f3f129c4871c5293c" type="image/png" /></div><div><br /></div></en-note>]]></content><created>20201011T212822Z</created><updated>20201011T233038Z</updated><note-attributes><latitude>37.77742571705006</latitude><longitude>-122.4256495114116</longitude><altitude>23.16121864318848</altitude><author>Simon Willison</author><source>desktop.mac</source><reminder-order>0</reminder-order></note-attributes><resource><data encoding="base64">PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHhtbG5zOnhsaW5rPSJodHRwOi8v
d3d3LnczLm9yZy8xOTk5L3hsaW5rIgphcmlhLWxhYmVsPSJQeXRob24iIHJvbGU9ImltZyIKdmlld0Jv
eD0iMCAwIDUxMiA1MTIiPjxyZWN0CndpZHRoPSI1MTIiIGhlaWdodD0iNTEyIgpyeD0iMTUlIgpmaWxs
PSIjZmZmIi8+PGcgZmlsbD0iIzVhOWZkNCI+PHBhdGggaWQ9InAiIGQ9Ik0yNTQgNjRjLTE2IDAtMzEg
Expand Down
10 changes: 5 additions & 5 deletions tests/test_evernote_to_sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@ def test_enex(tmpdir):
}
assert list(db["notes"].rows) == [
{
"id": "8e2d6cef463bf974fe15c145d02dcfb90e4dc2af",
"id": "1ea8b9baeca91343cdb6a12d44f5cdb2edf5f2e5",
"title": "Example note with images",
"content": '<en-note><div>This note includes two images.</div><div><br /></div><div><span style="font-weight: bold;">The Python logo</span></div><div><br /></div><div><en-media hash="61098c2c541de7f0a907c301dd6542da" type="image/svg+xml" width="125" /><br /></div><div><br /></div><div><span style="font-weight: bold;">The Evernote logo</span></div><div><br /></div><div><en-media hash="91bd26175acac0b2ffdb6efac199f8ca" type="image/svg+xml" width="125" /><br /></div><div><br /></div><div>This image contains text:</div><div><br /></div><div><en-media hash="76dd28b07797cc9f3f129c4871c5293c" type="image/png" /></div><div><br /></div></en-note>',
"content": '<en-note><div>This note includes two images. &#353;.</div><div><br /></div><div><span style="font-weight: bold;">The Python logo</span></div><div><br /></div><div><en-media hash="61098c2c541de7f0a907c301dd6542da" type="image/svg+xml" width="125" /><br /></div><div><br /></div><div><span style="font-weight: bold;">The Evernote logo</span></div><div><br /></div><div><en-media hash="91bd26175acac0b2ffdb6efac199f8ca" type="image/svg+xml" width="125" /><br /></div><div><br /></div><div>This image contains text:</div><div><br /></div><div><en-media hash="76dd28b07797cc9f3f129c4871c5293c" type="image/png" /></div><div><br /></div></en-note>',
"created": "2020-10-11T21:28:22",
"updated": "2020-10-11T23:30:38",
"latitude": "37.77742571705006",
Expand Down Expand Up @@ -96,15 +96,15 @@ def test_enex(tmpdir):
]
assert list(db["note_resources"].rows) == [
{
"note_id": "8e2d6cef463bf974fe15c145d02dcfb90e4dc2af",
"note_id": "1ea8b9baeca91343cdb6a12d44f5cdb2edf5f2e5",
"resource_id": "61098c2c541de7f0a907c301dd6542da",
},
{
"note_id": "8e2d6cef463bf974fe15c145d02dcfb90e4dc2af",
"note_id": "1ea8b9baeca91343cdb6a12d44f5cdb2edf5f2e5",
"resource_id": "91bd26175acac0b2ffdb6efac199f8ca",
},
{
"note_id": "8e2d6cef463bf974fe15c145d02dcfb90e4dc2af",
"note_id": "1ea8b9baeca91343cdb6a12d44f5cdb2edf5f2e5",
"resource_id": "76dd28b07797cc9f3f129c4871c5293c",
},
]
Expand Down

0 comments on commit 95a4905

Please sign in to comment.