diff --git a/lib/floki.ex b/lib/floki.ex
index 4ed1a7c8..ed33c9bb 100644
--- a/lib/floki.ex
+++ b/lib/floki.ex
@@ -538,14 +538,9 @@ defmodule Floki do
 
     opts = Keyword.validate!(opts, defaults)
 
-    cleaned_html_tree =
-      html
-      |> clean_html_tree(:js, opts[:js])
-      |> clean_html_tree(:style, opts[:style])
-
     search_strategy = if opts[:deep], do: Floki.DeepText, else: Floki.FlatText
 
-    search_strategy.get(cleaned_html_tree, opts[:sep], opts[:include_inputs])
+    search_strategy.get(html, opts)
   end
 
   @doc """
@@ -655,12 +650,6 @@ defmodule Floki do
   defp get_attribute_value([_ | rest], attr_name), do: get_attribute_value(rest, attr_name)
   defp get_attribute_value([], _attr_name), do: nil
 
-  defp clean_html_tree(html_tree, :js, true), do: html_tree
-  defp clean_html_tree(html_tree, :js, _), do: filter_out(html_tree, "script")
-
-  defp clean_html_tree(html_tree, :style, true), do: html_tree
-  defp clean_html_tree(html_tree, :style, _), do: filter_out(html_tree, "style")
-
   @doc """
   Returns the nodes from a HTML tree that don't match the filter selector.
 
diff --git a/lib/floki/deep_text.ex b/lib/floki/deep_text.ex
index 51d17ddc..cafb1ba9 100644
--- a/lib/floki/deep_text.ex
+++ b/lib/floki/deep_text.ex
@@ -8,37 +8,58 @@ defmodule Floki.DeepText do
 
-  @spec get(html_tree, binary, boolean) :: binary
+  @spec get(html_tree, binary | keyword, boolean) :: binary
 
-  def get(html_tree, sep \\ "", include_inputs? \\ false)
+  def get(html_tree, sep_or_opts \\ "", include_inputs? \\ false)
+
+  # Options-based entry point: `Floki` hands over its validated keyword list
+  # and <script>/<style> nodes are skipped during this single traversal
+  # instead of being filtered out of the tree beforehand.
+  def get(html_tree, opts, _) when is_list(opts) do
+    sep = Keyword.get(opts, :sep, "")
+    include_inputs? = Keyword.get(opts, :include_inputs, false)
+    # Only a literal `true` keeps these nodes; any other value (nil included)
+    # skips them, as the pre-filtering this replaces did.
+    js? = Keyword.get(opts, :js, false) == true
+    style? = Keyword.get(opts, :style, true) == true
+
+    html_tree
+    |> get_text([], sep, include_inputs?, js?, style?)
+    |> IO.iodata_to_binary()
+  end
 
+  # Positional form: performs no script/style filtering of its own.
   def get(html_tree, sep, include_inputs?) do
     html_tree
-    |> get_text([], sep, include_inputs?)
+    |> get_text([], sep, include_inputs?, true, true)
     |> IO.iodata_to_binary()
   end
 
-  defp get_text(text, [], _sep, _) when is_binary(text), do: text
-  defp get_text(text, acc, "", _) when is_binary(text), do: [acc, text]
-  defp get_text(text, acc, sep, _) when is_binary(text), do: [acc, sep, text]
+  defp get_text(text, [], _sep, _, _, _) when is_binary(text), do: text
+  defp get_text(text, acc, "", _, _, _) when is_binary(text), do: [acc, text]
+  defp get_text(text, acc, sep, _, _, _) when is_binary(text), do: [acc, sep, text]
 
-  defp get_text([], acc, _sep, _), do: acc
+  defp get_text([], acc, _sep, _, _, _), do: acc
 
-  defp get_text([child | rest], acc, sep, include_inputs?) do
-    acc = get_text(child, acc, sep, include_inputs?)
-    get_text(rest, acc, sep, include_inputs?)
+  defp get_text([child | rest], acc, sep, include_inputs?, js?, style?) do
+    acc = get_text(child, acc, sep, include_inputs?, js?, style?)
+    get_text(rest, acc, sep, include_inputs?, js?, style?)
   end
 
-  defp get_text({:comment, _}, acc, _, _), do: acc
-  defp get_text({"br", _, _}, acc, _, _), do: [acc, "\n"]
+  defp get_text({:comment, _}, acc, _, _, _, _), do: acc
+  defp get_text({"br", _, _}, acc, _, _, _, _), do: [acc, "\n"]
+
+  # Skip script/style subtrees entirely when the matching flag is false.
+  defp get_text({"script", _, _}, acc, _, _, false, _), do: acc
+  defp get_text({"style", _, _}, acc, _, _, _, false), do: acc
 
-  defp get_text({"input", attrs, _}, acc, _, true) do
+  defp get_text({"input", attrs, _}, acc, _, true, _, _) do
     [acc, Floki.TextExtractor.extract_input_value(attrs)]
   end
 
-  defp get_text({"textarea", attrs, _}, acc, _, true) do
+  defp get_text({"textarea", attrs, _}, acc, _, true, _, _) do
     [acc, Floki.TextExtractor.extract_input_value(attrs)]
   end
 
-  defp get_text({_, _, nodes}, acc, sep, include_inputs?) do
-    get_text(nodes, acc, sep, include_inputs?)
+  defp get_text({_, _, nodes}, acc, sep, include_inputs?, js?, style?) do
+    get_text(nodes, acc, sep, include_inputs?, js?, style?)
   end
 end
diff --git a/lib/floki/flat_text.ex b/lib/floki/flat_text.ex
index 595b2090..835c5783 100644
--- a/lib/floki/flat_text.ex
+++ b/lib/floki/flat_text.ex
@@ -13,42 +13,67 @@ defmodule Floki.FlatText do
 
-  @spec get(html_tree, binary, boolean) :: binary
+  @spec get(html_tree, binary | keyword, boolean) :: binary
 
-  def get(html_nodes, sep \\ "", include_inputs? \\ false)
+  def get(html_nodes, sep_or_opts \\ "", include_inputs? \\ false)
+
+  # Options-based entry point: `Floki` hands over its validated keyword list
+  # and <script>/<style> nodes are skipped during this single traversal
+  # instead of being filtered out of the tree beforehand.
+  def get(html_nodes, opts, _) when is_list(opts) do
+    sep = Keyword.get(opts, :sep, "")
+    include_inputs? = Keyword.get(opts, :include_inputs, false)
+    # Only a literal `true` keeps these nodes; any other value (nil included)
+    # skips them, as the pre-filtering this replaces did.
+    js? = Keyword.get(opts, :js, false) == true
+    style? = Keyword.get(opts, :style, true) == true
+
+    text =
+      if is_list(html_nodes) do
+        text_from_nodes(html_nodes, [], 0, sep, include_inputs?, js?, style?)
+      else
+        text_from_node(html_nodes, [], 0, sep, include_inputs?, js?, style?)
+      end
+
+    IO.iodata_to_binary(text)
+  end
 
+  # Positional forms: perform no script/style filtering of their own.
   def get(html_nodes, sep, include_inputs?) when is_list(html_nodes) do
     html_nodes
-    |> text_from_nodes([], 0, sep, include_inputs?)
+    |> text_from_nodes([], 0, sep, include_inputs?, true, true)
     |> IO.iodata_to_binary()
   end
 
   def get(html_node, sep, include_inputs?) do
     html_node
-    |> text_from_node([], 0, sep, include_inputs?)
+    |> text_from_node([], 0, sep, include_inputs?, true, true)
     |> IO.iodata_to_binary()
   end
 
-  defp text_from_nodes([], acc, _, _, _), do: acc
+  defp text_from_nodes([], acc, _, _, _, _, _), do: acc
 
-  defp text_from_nodes([node | rest], acc, depth, sep, include_inputs?) do
-    acc = text_from_node(node, acc, depth, sep, include_inputs?)
-    text_from_nodes(rest, acc, depth, sep, include_inputs?)
+  defp text_from_nodes([node | rest], acc, depth, sep, include_inputs?, js?, style?) do
+    acc = text_from_node(node, acc, depth, sep, include_inputs?, js?, style?)
+    text_from_nodes(rest, acc, depth, sep, include_inputs?, js?, style?)
   end
 
-  defp text_from_node({"input", attrs, []}, acc, _, _, true) do
+  # Skip script/style nodes entirely when the matching flag is false.
+  defp text_from_node({"script", _, _}, acc, _, _, _, false, _), do: acc
+  defp text_from_node({"style", _, _}, acc, _, _, _, _, false), do: acc
+
+  defp text_from_node({"input", attrs, []}, acc, _, _, true, _, _) do
     [acc, Floki.TextExtractor.extract_input_value(attrs)]
   end
 
-  defp text_from_node({"textarea", attrs, []}, acc, _, _, true) do
+  defp text_from_node({"textarea", attrs, []}, acc, _, _, true, _, _) do
     [acc, Floki.TextExtractor.extract_input_value(attrs)]
   end
 
-  defp text_from_node({_tag, _attrs, html_nodes}, acc, depth, sep, include_inputs?)
+  defp text_from_node({_tag, _attrs, html_nodes}, acc, depth, sep, include_inputs?, js?, style?)
        when depth < 1 do
-    text_from_nodes(html_nodes, acc, depth + 1, sep, include_inputs?)
+    text_from_nodes(html_nodes, acc, depth + 1, sep, include_inputs?, js?, style?)
   end
 
-  defp text_from_node(text, [], _, _sep, _) when is_binary(text), do: text
-  defp text_from_node(text, acc, _, "", _) when is_binary(text), do: [acc, text]
-  defp text_from_node(text, acc, _, sep, _) when is_binary(text), do: [acc, sep, text]
-  defp text_from_node(_, acc, _, _, _), do: acc
+  defp text_from_node(text, [], _, _sep, _, _, _) when is_binary(text), do: text
+  defp text_from_node(text, acc, _, "", _, _, _) when is_binary(text), do: [acc, text]
+  defp text_from_node(text, acc, _, sep, _, _, _) when is_binary(text), do: [acc, sep, text]
+  defp text_from_node(_, acc, _, _, _, _, _), do: acc
 end