Pleroma/lib/pleroma/web/rich_media/parser.ex

# Pleroma: A lightweight social networking server
# Copyright © 2017-2020 Pleroma Authors <https://pleroma.social/>
# SPDX-License-Identifier: AGPL-3.0-only

defmodule Pleroma.Web.RichMedia.Parser do
  @moduledoc """
  Fetches a URL and extracts rich media metadata from the returned HTML.

  The parsers configured under `[:rich_media, :parsers]` are tried in order and
  the first non-empty result is used. Unless the module was compiled for the
  `:test` environment, results (successful or not) go through the
  `:rich_media_cache` Cachex cache.
  """

  # Base adapter options handed to `Pleroma.HTTP.get/3` when fetching a page.
  @options [
    pool: :media,
    max_body: 2_000_000
  ]

  # Parser modules to try, read from the configuration at call time.
  defp parsers do
    Pleroma.Config.get([:rich_media, :parsers])
  end

  @doc """
  Fetches `url` and returns the rich media metadata found in the page.

  Returns `{:ok, data}` where `data` is a map containing at least a non-empty
  `"title"` and the requested `"url"`, or `{:error, reason}` with a
  human-readable reason string.
  """
  @spec parse(String.t() | nil) :: {:ok, map()} | {:error, String.t()}
  def parse(nil), do: {:error, "No URL provided"}

  # The cache is bypassed when compiled for the test environment; this check
  # runs at compile time, so only one of the two clauses below exists.
  if Pleroma.Config.get(:env) == :test do
    def parse(url), do: parse_url(url)
  else
    def parse(url) do
      try do
        # Both successful and failed parses are committed to the cache.
        :rich_media_cache
        |> Cachex.fetch!(url, fn _ -> {:commit, parse_url(url)} end)
        |> set_ttl_based_on_image(url)
      rescue
        e ->
          {:error, "Cachex error: #{inspect(e)}"}
      end
    end
  end

  @doc """
  Sets the rich media cache expiration based on the expiration time of the image.

  The modules listed under `ttl_setters` must adopt the behaviour
  `Pleroma.Web.RichMedia.Parser.TTL`.

  An `{:ok, data}` result is always returned unchanged; the cache entry for
  `url` only gets an expiration when it has none yet and a setter returned a
  number. An `{:error, reason}` result is passed through untouched.

  ## Example

      defmodule MyModule do
        @behaviour Pleroma.Web.RichMedia.Parser.TTL
        def ttl(data, url) do
          # parsed metadata is keyed by strings (e.g. "title", "url")
          image_url = Map.get(data, "image")
          # do some parsing of the url to find out when the image expires
          # and return that moment as a unix timestamp (in seconds)
          parse_ttl_from_url(image_url)
        end
      end

  Define the module in the config

      config :pleroma, :rich_media,
        ttl_setters: [MyModule]
  """
  @spec set_ttl_based_on_image({:ok, map()} | {:error, term()}, String.t()) ::
          {:ok, map()} | {:error, term()}
  def set_ttl_based_on_image({:ok, data}, url) do
    # `{:ok, nil}` from `Cachex.ttl/2` means no expiration is set on the entry,
    # so an existing expiration is never overwritten.
    with {:ok, nil} <- Cachex.ttl(:rich_media_cache, url),
         ttl when is_number(ttl) <- get_ttl_from_image(data, url) do
      # Setters return seconds, `Cachex.expire_at/3` is given milliseconds.
      Cachex.expire_at(:rich_media_cache, url, ttl * 1000)
      {:ok, data}
    else
      _ ->
        {:ok, data}
    end
  end

  # Failed parses are cached as well and reach this function through `parse/1`.
  # Hand them back unchanged so the caller sees the real parsing error instead
  # of a FunctionClauseError wrapped in a "Cachex error".
  def set_ttl_based_on_image({:error, _reason} = error, _url), do: error

  # Asks the configured ttl setters, in order, for an expiration time.
  # The accumulator starts as `{:ok, nil}`; as soon as a module returns
  # anything that is not an `{:ok, _}` tuple (a numeric ttl or an error),
  # that value is kept and the remaining modules are not called.
  defp get_ttl_from_image(data, url) do
    # A missing `ttl_setters` key is treated as "no setters configured".
    ttl_setters = Pleroma.Config.get([:rich_media, :ttl_setters]) || []

    Enum.reduce(ttl_setters, {:ok, nil}, fn
      module, {:ok, _ttl} ->
        module.ttl(data, url)

      _, error ->
        error
    end)
  end

  # Downloads `url`, runs the parsers over the document and validates the
  # outcome. Any exception raised on the way (failed request, unparsable HTML,
  # crashing parser) is turned into an `{:error, reason}` tuple.
  defp parse_url(url) do
    opts =
      if Application.get_env(:tesla, :adapter) == Tesla.Adapter.Hackney do
        # Extra options that are only passed when Hackney is the adapter.
        Keyword.merge(@options,
          recv_timeout: 2_000,
          with_body: true
        )
      else
        @options
      end

    try do
      # A non-`{:ok, _}` response raises MatchError, handled by the rescue below.
      {:ok, %Tesla.Env{body: html}} = Pleroma.HTTP.get(url, [], adapter: opts)

      html
      |> parse_html()
      |> maybe_parse()
      |> Map.put("url", url)
      |> clean_parsed_data()
      |> check_parsed_data()
    rescue
      e ->
        {:error, "Parsing error: #{inspect(e)} #{inspect(__STACKTRACE__)}"}
    end
  end

  defp parse_html(html), do: Floki.parse_document!(html)

  # Tries every configured parser until one returns a non-empty result;
  # yields an empty map when none of them found anything.
  defp maybe_parse(html) do
    Enum.reduce_while(parsers(), %{}, fn parser, acc ->
      case parser.parse(html, acc) do
        data when data != %{} -> {:halt, data}
        _ -> {:cont, acc}
      end
    end)
  end

  # Metadata is only considered usable when it carries a non-empty title.
  defp check_parsed_data(%{"title" => title} = data)
       when is_binary(title) and title != "" do
    {:ok, data}
  end

  defp check_parsed_data(data) do
    {:error, "Found metadata was invalid or incomplete: #{inspect(data)}"}
  end

  # Drops every key/value pair that cannot be encoded to JSON.
  defp clean_parsed_data(data) do
    data
    |> Enum.reject(fn {key, val} ->
      not match?({:ok, _}, Jason.encode(%{key => val}))
    end)
    |> Map.new()
  end
end
rich media: parser: add copyright header 2019-01-28 20:59:36 +01:00			`# Pleroma: A lightweight social networking server`
Bump copyright years of files changed after 2020-01-07 Done via the following command: git diff fcd5dd259a1700a045be902b43391b0d1bd58a5b --stat --name-only \| xargs sed -i '/Pleroma Authors/c# Copyright © 2017-2020 Pleroma Authors <https:\/\/pleroma.social\/>' 2020-03-02 06:08:45 +01:00			`# Copyright © 2017-2020 Pleroma Authors <https://pleroma.social/>`
rich media: parser: add copyright header 2019-01-28 20:59:36 +01:00			`# SPDX-License-Identifier: AGPL-3.0-only`

Add OGP parser 2019-01-01 21:26:40 +01:00			`defmodule Pleroma.Web.RichMedia.Parser do`
adding gun adapter 2020-02-11 08:12:57 +01:00			`@options [`
rich media: tighten fetching timeouts and size limits 2019-02-10 22:37:51 +01:00			`pool: :media,`
adding gun adapter 2020-02-11 08:12:57 +01:00			`max_body: 2_000_000`
rich media: tighten fetching timeouts and size limits 2019-02-10 22:37:51 +01:00			`]`

parsers configurable 2019-07-11 15:04:42 +02:00			`defp parsers do`
			`Pleroma.Config.get([:rich_media, :parsers])`
			`end`

rich media: gracefully handle fetching nil URIs 2019-01-26 17:26:11 +01:00			`def parse(nil), do: {:error, "No URL provided"}`

Replace Mix.env with Pleroma.Config.get(:env) Mix.env/0 is not availible in release environments such as distillery or elixir's built-in releases. 2019-06-06 22:59:51 +02:00			`if Pleroma.Config.get(:env) == :test do`
rich media: disable cachex in test mode 2019-01-05 00:50:54 +01:00			`def parse(url), do: parse_url(url)`
			`else`
rich media: gracefully handle fetching nil URIs 2019-01-26 17:26:11 +01:00			`def parse(url) do`
rich media: parser: cache negatives 2019-01-28 21:19:07 +01:00			`try do`
			`Cachex.fetch!(:rich_media_cache, url, fn _ ->`
			`{:commit, parse_url(url)}`
			`end)`
add the rich media ttl based on image exp time 2019-07-16 18:52:36 +02:00			`\|> set_ttl_based_on_image(url)`
rich media: parser: cache negatives 2019-01-28 21:19:07 +01:00			`rescue`
			`e ->`
			`{:error, "Cachex error: #{inspect(e)}"}`
rich media: gracefully handle fetching nil URIs 2019-01-26 17:26:11 +01:00			`end`
			`end`
rich media: disable cachex in test mode 2019-01-05 00:50:54 +01:00			`end`
Add OGP parser 2019-01-01 21:26:40 +01:00
add the rich media ttl based on image exp time 2019-07-16 18:52:36 +02:00			`@doc """`
			`Set the rich media cache based on the expiration time of image.`

change the structure of image ttl parsar 2019-07-19 07:58:42 +02:00			Adopt behaviour `Pleroma.Web.RichMedia.Parser.TTL`
add the rich media ttl based on image exp time 2019-07-16 18:52:36 +02:00
			`## Example`

			`defmodule MyModule do`
change the structure of image ttl parsar 2019-07-19 07:58:42 +02:00			`@behaviour Pleroma.Web.RichMedia.Parser.TTL`
			`def ttl(data, url) do`
add the rich media ttl based on image exp time 2019-07-16 18:52:36 +02:00			`image_url = Map.get(data, :image)`
			`# do some parsing in the url and get the ttl of the image`
change the structure of image ttl parsar 2019-07-19 07:58:42 +02:00			`# and return ttl is unix time`
			`parse_ttl_from_url(image_url)`
add the rich media ttl based on image exp time 2019-07-16 18:52:36 +02:00			`end`
			`end`

			`Define the module in the config`

			`config :pleroma, :rich_media,`
			`ttl_setters: [MyModule]`
			`"""`
			`def set_ttl_based_on_image({:ok, data}, url) do`
Fix rich media parser failing when no TTL can be found by image TTL setters 2019-07-21 17:22:22 +02:00			`with {:ok, nil} <- Cachex.ttl(:rich_media_cache, url),`
			`ttl when is_number(ttl) <- get_ttl_from_image(data, url) do`
change the structure of image ttl parsar 2019-07-19 07:58:42 +02:00			`Cachex.expire_at(:rich_media_cache, url, ttl * 1000)`
			`{:ok, data}`
			`else`
add the rich media ttl based on image exp time 2019-07-16 18:52:36 +02:00			`_ ->`
			`{:ok, data}`
			`end`
			`end`

change the structure of image ttl parsar 2019-07-19 07:58:42 +02:00			`defp get_ttl_from_image(data, url) do`
			`Pleroma.Config.get([:rich_media, :ttl_setters])`
			`\|> Enum.reduce({:ok, nil}, fn`
			`module, {:ok, _ttl} ->`
			`module.ttl(data, url)`

			`_, error ->`
			`error`
			`end)`
			`end`
add the rich media ttl based on image exp time 2019-07-16 18:52:36 +02:00
rich media: disable cachex in test mode 2019-01-05 00:50:54 +01:00			`defp parse_url(url) do`
adding gun adapter 2020-02-11 08:12:57 +01:00			`opts =`
			`if Application.get_env(:tesla, :adapter) == Tesla.Adapter.Hackney do`
			`Keyword.merge(@options,`
			`recv_timeout: 2_000,`
			`with_body: true`
			`)`
			`else`
			`@options`
			`end`

rich media: add try/rescue to ensure we catch parsing and fetching failures 2019-01-27 13:21:05 +01:00			`try do`
adding gun adapter 2020-02-11 08:12:57 +01:00			`{:ok, %Tesla.Env{body: html}} = Pleroma.HTTP.get(url, [], adapter: opts)`
rich media: use cachex to avoid flooding remote servers 2019-01-05 00:23:47 +01:00
Replace missing non-nullable Card attributes with empty strings 2019-05-30 23:03:31 +02:00			`html`
Use floki's new APIs for parsing fragments 2020-02-15 23:55:26 +01:00			`\|> parse_html()`
Replace missing non-nullable Card attributes with empty strings 2019-05-30 23:03:31 +02:00			`\|> maybe_parse()`
Fix atom leak in Rich Media Parser 2020-06-09 19:49:24 +02:00			`\|> Map.put("url", url)`
Replace missing non-nullable Card attributes with empty strings 2019-05-30 23:03:31 +02:00			`\|> clean_parsed_data()`
			`\|> check_parsed_data()`
rich media: add try/rescue to ensure we catch parsing and fetching failures 2019-01-27 13:21:05 +01:00			`rescue`
rich media: parser: cache negatives 2019-01-28 21:19:07 +01:00			`e ->`
Use floki's new APIs for parsing fragments 2020-02-15 23:55:26 +01:00			`{:error, "Parsing error: #{inspect(e)} #{inspect(__STACKTRACE__)}"}`
rich media: add try/rescue to ensure we catch parsing and fetching failures 2019-01-27 13:21:05 +01:00			`end`
Add RichMediaController and tests 2019-01-02 15:02:50 +01:00			`end`

Use floki's new APIs for parsing fragments 2020-02-15 23:55:26 +01:00			`defp parse_html(html), do: Floki.parse_document!(html)`
added prepare html for RichMedia.Parser 2019-09-15 13:53:58 +02:00
Add RichMediaController and tests 2019-01-02 15:02:50 +01:00			`defp maybe_parse(html) do`
parsers configurable 2019-07-11 15:04:42 +02:00			`Enum.reduce_while(parsers(), %{}, fn parser, acc ->`
Add OGP parser 2019-01-01 21:26:40 +01:00			`case parser.parse(html, acc) do`
Merge OGP parser with TwitterCard 2020-06-11 15:57:31 +02:00			`data when data != %{} -> {:halt, data}`
			`_ -> {:cont, acc}`
Add OGP parser 2019-01-01 21:26:40 +01:00			`end`
			`end)`
			`end`
Add RichMediaController and tests 2019-01-02 15:02:50 +01:00
Fix atom leak in Rich Media Parser 2020-06-09 19:49:24 +02:00			`defp check_parsed_data(%{"title" => title} = data)`
			`when is_binary(title) and title != "" do`
rich media: parser: add some basic sanity checks on the returned data with pattern matching 2019-01-28 21:31:43 +01:00			`{:ok, data}`
Add RichMediaController and tests 2019-01-02 15:02:50 +01:00			`end`

rich media: parser: reject OGP fields we cannot safely process 2019-01-31 17:03:56 +01:00			`defp check_parsed_data(data) do`
rich media: parser: add some basic sanity checks on the returned data with pattern matching 2019-01-28 21:31:43 +01:00			`{:error, "Found metadata was invalid or incomplete: #{inspect(data)}"}`
Add RichMediaController and tests 2019-01-02 15:02:50 +01:00			`end`
rich media: parser: reject OGP fields we cannot safely process 2019-01-31 17:03:56 +01:00
			`defp clean_parsed_data(data) do`
			`data`
rich media: parser: reject any data which cannot be explicitly encoded into JSON 2019-02-05 21:50:57 +01:00			`\|> Enum.reject(fn {key, val} ->`
Fix atom leak in Rich Media Parser 2020-06-09 19:49:24 +02:00			`not match?({:ok, _}, Jason.encode(%{key => val}))`
rich media: parser: reject OGP fields we cannot safely process 2019-01-31 17:03:56 +01:00			`end)`
			`\|> Map.new()`
			`end`
Add OGP parser 2019-01-01 21:26:40 +01:00			`end`