From 2c7365da095389c9ae7af93c2eabe494c7ba8ac5 Mon Sep 17 00:00:00 2001 From: davidmezzetti <561939+davidmezzetti@users.noreply.github.com> Date: Thu, 22 Apr 2021 13:31:07 -0400 Subject: [PATCH] Use lxml to help generate clean HTML --- examples/search.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/examples/search.py b/examples/search.py index e9f2717..0d7b550 100644 --- a/examples/search.py +++ b/examples/search.py @@ -1,8 +1,8 @@ """ Search a paperai index. -Requires streamlit to be installed. - pip install streamlit +Requires streamlit and lxml to be installed. + pip install streamlit lxml """ import os @@ -12,6 +12,8 @@ import pandas as pd import streamlit as st +from lxml.html.clean import clean_html + from paperai.models import Models from paperai.query import Query @@ -56,12 +58,11 @@ def search(self, query, topn, threshold): # Print each result, sorted by max score descending for uid in sorted(documents, key=lambda k: sum([x[0] for x in documents[k]]), reverse=True): - cur.execute("SELECT Title, Published, Publication, Design, Size, Sample, Method, Entry, Id, Reference " + + cur.execute("SELECT Title, Published, Publication, Design, Size, Sample, Method, Entry, Id, Reference " + "FROM articles WHERE id = ?", [uid]) article = cur.fetchone() - matches = "\n".join([text for _, text in documents[uid]]) - matches = matches.replace("<", "<").replace(">", ">").replace("&", "&") + matches = "
".join([text for _, text in documents[uid]]) title = "%s" % (article[9], article[0]) @@ -94,7 +95,8 @@ def run(self): st.markdown("

%d results

" % len(df), unsafe_allow_html=True) if not df.empty: - st.write(df[columns].to_html(escape=False, index=False), unsafe_allow_html=True) + html = df[columns].to_html(escape=False, index=False) + st.write(clean_html(html), unsafe_allow_html=True) @st.cache(allow_output_mutation=True) def create(path): @@ -113,7 +115,7 @@ def create(path): if len(sys.argv) <= 1 or not os.path.isdir(sys.argv[1]): st.error("Path to embeddings index not present or invalid") - else: + else: st.set_page_config(layout="wide") # Create and run application