Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 1 addition & 12 deletions lib/floki.ex
Original file line number Diff line number Diff line change
Expand Up @@ -538,14 +538,9 @@ defmodule Floki do

opts = Keyword.validate!(opts, defaults)

cleaned_html_tree =
html
|> clean_html_tree(:js, opts[:js])
|> clean_html_tree(:style, opts[:style])

search_strategy = if opts[:deep], do: Floki.DeepText, else: Floki.FlatText

search_strategy.get(cleaned_html_tree, opts[:sep], opts[:include_inputs])
search_strategy.get(html, opts)
end

@doc """
Expand Down Expand Up @@ -655,12 +650,6 @@ defmodule Floki do
defp get_attribute_value([_ | rest], attr_name), do: get_attribute_value(rest, attr_name)
defp get_attribute_value([], _attr_name), do: nil

# Removes the subtrees that hold non-content text before text extraction.
#
# The third argument is the caller's option value for the given kind. Only an
# exact `true` keeps the tree untouched; any other value (false, nil, ...)
# filters the corresponding elements out via `filter_out/2`.

# :js controls <script> elements.
defp clean_html_tree(html_tree, :js, keep?) do
  if keep? == true, do: html_tree, else: filter_out(html_tree, "script")
end

# :style controls <style> elements.
defp clean_html_tree(html_tree, :style, keep?) do
  if keep? == true, do: html_tree, else: filter_out(html_tree, "style")
end

@doc """
Returns the nodes from a HTML tree that don't match the filter selector.

Expand Down
44 changes: 29 additions & 15 deletions lib/floki/deep_text.ex
Original file line number Diff line number Diff line change
Expand Up @@ -8,37 +8,51 @@ defmodule Floki.DeepText do

# Extracts the text of every node in `html_tree`, descending through all
# nesting levels, and returns it as a single binary.
#
# Two call shapes are accepted:
#
#   * `get(tree, opts)` - `opts` is a keyword list. Keys read here and their
#     defaults: `:sep` (""), `:include_inputs` (false), `:js` (false) and
#     `:style` (true). When `:js` / `:style` is `false`, <script> / <style>
#     elements contribute nothing. The third positional argument is ignored.
#   * `get(tree, sep, include_inputs?)` - positional form. It performs no
#     script/style filtering of its own (both flags are passed as `true`).
@spec get(html_tree, binary | keyword, boolean) :: binary

def get(html_tree, sep_or_opts \\ "", include_inputs? \\ false)

def get(html_tree, opts, _) when is_list(opts) do
  sep = Keyword.get(opts, :sep, "")
  include_inputs? = Keyword.get(opts, :include_inputs, false)
  js? = Keyword.get(opts, :js, false)
  style? = Keyword.get(opts, :style, true)

  html_tree
  |> get_text([], sep, include_inputs?, js?, style?)
  |> IO.iodata_to_binary()
end

def get(html_tree, sep, include_inputs?) do
  html_tree
  |> get_text([], sep, include_inputs?, true, true)
  |> IO.iodata_to_binary()
end

# get_text(node_or_nodes, acc, sep, include_inputs?, js?, style?)
#
# Accumulates iodata in `acc`; the caller flattens it once at the end.

# Text nodes. The very first piece of text is returned bare so that no
# separator is ever emitted in front of it; an empty separator is skipped
# entirely instead of being inserted as "".
defp get_text(text, [], _sep, _, _, _) when is_binary(text), do: text
defp get_text(text, acc, "", _, _, _) when is_binary(text), do: [acc, text]
defp get_text(text, acc, sep, _, _, _) when is_binary(text), do: [acc, sep, text]

# Lists of nodes: walk left to right, threading the accumulator through.
defp get_text([], acc, _sep, _, _, _), do: acc

defp get_text([child | rest], acc, sep, include_inputs?, js?, style?) do
  acc = get_text(child, acc, sep, include_inputs?, js?, style?)
  get_text(rest, acc, sep, include_inputs?, js?, style?)
end

# Comments contribute nothing; <br> becomes a newline (no separator added).
defp get_text({:comment, _}, acc, _, _, _, _), do: acc
defp get_text({"br", _, _}, acc, _, _, _, _), do: [acc, "\n"]

# <script> / <style> are dropped only when their flag is exactly `false`.
# These clauses must stay above the generic element clause below.
defp get_text({"script", _, _}, acc, _, _, false, _), do: acc
defp get_text({"style", _, _}, acc, _, _, _, false), do: acc

# Form fields are read from their attributes, and only when include_inputs?
# is `true`; otherwise they fall through to the generic element clause.
defp get_text({"input", attrs, _}, acc, _, true, _, _) do
  [acc, Floki.TextExtractor.extract_input_value(attrs)]
end

defp get_text({"textarea", attrs, _}, acc, _, true, _, _) do
  [acc, Floki.TextExtractor.extract_input_value(attrs)]
end

# Any other element: recurse into its children.
defp get_text({_, _, nodes}, acc, sep, include_inputs?, js?, style?) do
  get_text(nodes, acc, sep, include_inputs?, js?, style?)
end
end
47 changes: 32 additions & 15 deletions lib/floki/flat_text.ex
Original file line number Diff line number Diff line change
Expand Up @@ -13,42 +13,59 @@ defmodule Floki.FlatText do

# Extracts only "flat" text: text that sits directly inside the given
# node(s), without descending into nested elements. Returns a single binary.
#
# Two call shapes are accepted:
#
#   * `get(nodes, opts)` - `opts` is a keyword list. Keys read here and their
#     defaults: `:sep` (""), `:include_inputs` (false), `:js` (false) and
#     `:style` (true). When `:js` / `:style` is `false`, <script> / <style>
#     elements contribute nothing. The third positional argument is ignored.
#   * `get(nodes, sep, include_inputs?)` - positional form. It performs no
#     script/style filtering of its own (both flags are passed as `true`).
#
# In both shapes the first argument may be a list of nodes or a single node.
@spec get(html_tree, binary | keyword, boolean) :: binary

def get(html_nodes, sep_or_opts \\ "", include_inputs? \\ false)

def get(html_nodes, opts, _) when is_list(opts) do
  sep = Keyword.get(opts, :sep, "")
  include_inputs? = Keyword.get(opts, :include_inputs, false)
  js? = Keyword.get(opts, :js, false)
  style? = Keyword.get(opts, :style, true)

  iodata =
    if is_list(html_nodes) do
      text_from_nodes(html_nodes, [], 0, sep, include_inputs?, js?, style?)
    else
      text_from_node(html_nodes, [], 0, sep, include_inputs?, js?, style?)
    end

  IO.iodata_to_binary(iodata)
end

def get(html_nodes, sep, include_inputs?) when is_list(html_nodes) do
  html_nodes
  |> text_from_nodes([], 0, sep, include_inputs?, true, true)
  |> IO.iodata_to_binary()
end

def get(html_node, sep, include_inputs?) do
  html_node
  |> text_from_node([], 0, sep, include_inputs?, true, true)
  |> IO.iodata_to_binary()
end

# text_from_nodes(nodes, acc, depth, sep, include_inputs?, js?, style?)
#
# Folds text_from_node/7 over a list, threading the iodata accumulator.
defp text_from_nodes([], acc, _, _, _, _, _), do: acc

defp text_from_nodes([node | rest], acc, depth, sep, include_inputs?, js?, style?) do
  acc = text_from_node(node, acc, depth, sep, include_inputs?, js?, style?)
  text_from_nodes(rest, acc, depth, sep, include_inputs?, js?, style?)
end

# text_from_node(node, acc, depth, sep, include_inputs?, js?, style?)

# <script> / <style> are dropped only when their flag is exactly `false`.
# These clauses must stay above the generic element clause below.
defp text_from_node({"script", _, _}, acc, _, _, _, false, _), do: acc
defp text_from_node({"style", _, _}, acc, _, _, _, _, false), do: acc

# Childless form fields are read from their attributes, and only when
# include_inputs? is `true`.
defp text_from_node({"input", attrs, []}, acc, _, _, true, _, _) do
  [acc, Floki.TextExtractor.extract_input_value(attrs)]
end

defp text_from_node({"textarea", attrs, []}, acc, _, _, true, _, _) do
  [acc, Floki.TextExtractor.extract_input_value(attrs)]
end

# Elements are opened only at depth 0; their children are visited at depth 1,
# where nested elements fall through to the catch-all and are ignored.
defp text_from_node({_tag, _attrs, html_nodes}, acc, depth, sep, include_inputs?, js?, style?)
     when depth < 1 do
  text_from_nodes(html_nodes, acc, depth + 1, sep, include_inputs?, js?, style?)
end

# Text nodes. The very first piece of text is returned bare so that no
# separator is emitted in front of it; an empty separator is skipped entirely.
defp text_from_node(text, [], _, _sep, _, _, _) when is_binary(text), do: text
defp text_from_node(text, acc, _, "", _, _, _) when is_binary(text), do: [acc, text]
defp text_from_node(text, acc, _, sep, _, _, _) when is_binary(text), do: [acc, sep, text]

# Anything else (nested elements past depth 0, comments, ...) adds nothing.
defp text_from_node(_, acc, _, _, _, _, _), do: acc
end
Loading