Why does the efficiency guide not mention how expensive exported functions can be for compilation times

If we convert the abstract code back to Erlang, the interesting part starts like this and goes on for 288K lines:

case_fn(_binary@1, _acc@1) ->
    case _binary@1 of
        <<",", _@1/binary>> -> case_fn(_@1, [<<",">> | _acc@1]);
        <<"0", _@2/binary>> -> case_fn(_@2, [<<"0">> | _acc@1]);
        <<"1", _@3/binary>> -> case_fn(_@3, [<<"1">> | _acc@1]);
        <<"2", _@4/binary>> -> case_fn(_@4, [<<"2">> | _acc@1]);
        <<"3", _@5/binary>> -> case_fn(_@5, [<<"3">> | _acc@1]);
        <<"4", _@6/binary>> -> case_fn(_@6, [<<"4">> | _acc@1]);
        <<"5", _@7/binary>> -> case_fn(_@7, [<<"5">> | _acc@1]);
        <<"6", _@8/binary>> -> case_fn(_@8, [<<"6">> | _acc@1]);
        <<"7", _@9/binary>> -> case_fn(_@9, [<<"7">> | _acc@1]);
        <<"8", _@10/binary>> ->
            case_fn(_@10, [<<"8">> | _acc@1]);
        <<"9", _@11/binary>> ->
            case_fn(_@11, [<<"9">> | _acc@1]);
        <<"A", _@12/binary>> ->
            case_fn(_@12, [<<"A">> | _acc@1]);
        <<"B", _@13/binary>> ->
            case_fn(_@13, [<<"B">> | _acc@1]);
        <<"C", _@14/binary>> ->
            case_fn(_@14, [<<"C">> | _acc@1]);
        <<"D", _@15/binary>> ->
            case_fn(_@15, [<<"D">> | _acc@1]);
        ......
2 Likes

With your provided link it’s not entirely clear what your conclusion is — are you stating that there is a way to generate the code so that it compiles faster? The essential part is that I need to make it recursive without introducing overhead that would make it slower than a guard.

I think I got something working:

# Forum experiment: build a `case` expression at compile time with one clause
# per binary pattern returned by Unicode.Set.to_pattern!/1, and compare it
# against a plain guard-based implementation.
defmodule Test do
  require Unicode.Set
  # Hand-written AST: for every pattern `c`, a clause of the form
  #   <<c, rest::binary>> -> case_fn(rest, [c | acc])
  # followed by a final catch-all clause `_ -> acc`.
  case_ast =
    for c <- Unicode.Set.to_pattern!("[[:Lu:], [:Ll:], [:Lt:], [:Lm:], [:Lo:], [:Nl:], [:Mn:], [:Mc:], [:Nd:], [:Pc:], [:Cf:]]") do
        {:->, [],
         [
           [
             {:<<>>, [],
              [c, {:"::", [], [{:rest, [], Elixir}, {:binary, [], Elixir}]}]}
           ],
           {:case_fn, [],
            [{:rest, [], Elixir}, [{:|, [], [c, {:acc, [], Elixir}]}]]}
         ]}
    end ++ [{:->, [], [[{:_, [], Elixir}], {:acc, [], Elixir}]}]
    # Wrap the clause list into `case binary do <clauses> end`. Note the
    # clauses reference variables named `binary`/`acc` in the `Elixir` context.
    case_ast={:case, [],
     [
       {:binary, [], Elixir},
       [do: case_ast]
     ]}


  # Macro version: splices the generated `case` into the caller, with
  # `bind_quoted` evaluating the caller's `binary`/`acc` arguments once.
  defmacro case_mac(binary, acc) do
    quote bind_quoted: [binary: binary, acc: acc] do
      unquote(case_ast)
    end
  end

  # NOTE(review): this is a plain function, so (unlike the macro above) the
  # `quote` here would return a quoted AST at runtime rather than executing
  # the generated `case`; it also relies on the module-body variable
  # `case_ast` being visible inside a `def` body, which is doubtful.
  # Presumably a leftover from the macro variant — TODO confirm it compiles
  # and behaves as intended.
  def case_fn(binary, acc) do
    quote bind_quoted: [binary: binary, acc: acc] do
      unquote(case_ast)
    end
  end

  # Guard-based baseline: tests only the first UTF-8 codepoint for
  # membership in the same Unicode set via the Unicode.Set.match?/2 guard.
  def guard_fn(<<b::utf8, _::binary>>) when Unicode.Set.match?(b, "[[:Lu:], [:Ll:], [:Lt:], [:Lm:], [:Lo:], [:Nl:], [:Mn:], [:Mc:], [:Nd:], [:Pc:], [:Cf:]]"), do: true
  def guard_fn(_binary), do: false
end

Here is how I would solve this problem. It seems your goal is to parse a valid unicode SQL identifier. In this case, you don’t actually need to accumulate the parsed codepoints. You only need to count how many bytes the segment is, and then extract it from the binary later.

Mix.install([{:unicode_set, "~> 1.0"}])

# Splits a binary into {matching_prefix, rest}, where the prefix consists of
# codepoints in the given Unicode set. Instead of accumulating the parsed
# codepoints, it only counts how many bytes the matching prefix occupies and
# extracts it from the original binary in one match.
defmodule QuiteFast do
  require Unicode.Set

  # One `->` clause per pattern: match the pattern at the head of the binary
  # and add its byte size to the recursive count; final catch-all returns 0.
  # `hd/1` extracts the single clause from the quoted block.
  case_ast =
    for c <-
          Unicode.Set.to_pattern!(
            "[[:Lu:], [:Ll:], [:Lt:], [:Lm:], [:Lo:], [:Nl:], [:Mn:], [:Mc:], [:Nd:], [:Pc:], [:Cf:]]"
          ) do
      hd(
        quote do
          # byte_size(c) is computed at compile time, so each clause adds a
          # literal integer at runtime.
          <<unquote(c), rest::binary>> -> unquote(byte_size(c)) + count_bytes(rest)
        end
      )
    end ++
      quote do
        _ -> 0
      end

  # Count the prefix length, then split the binary by size — no list of
  # sub-binaries is ever built.
  def case_fn(binary) do
    size = count_bytes(binary)
    <<pre::binary-size(^size), post::binary>> = binary
    {pre, post}
  end

  # Uses an "unquote fragment": inside a `def`, `unquote/1` may reference
  # module-body variables, so the generated clause list is injected directly
  # into this function at compile time.
  defp count_bytes(binary) do
    case binary do
      unquote(case_ast)
    end
  end
end

# Fixed module reference: the module defined above is QuiteFast, not Fast —
# calling the undefined `Fast` would raise UndefinedFunctionError.
IO.inspect(QuiteFast.case_fn("foo-bar"))

This version compiles ~2x slower than your no-op baseline because it does little work in each clause. It should also be faster at runtime than your Slow version, as it avoids building a list of binaries.

Finally, I still believe this is more of an algorithm/data-structure question than an issue with guards or particular constructs. Your problem statement is to identify more than 140k different codepoints and parse them. All solutions above (including mine) are brute-forcing it and in such cases, the less code you write and less data you build, it will likely be faster. If you want it to be even faster at runtime and compile-time, you may need to actually build tables with byte-ranges and similar to do optimized lookups.

1 Like

Thanks, I’m going to reduce the number of patterns being generated by an order of magnitude and use some range guards, although I was hoping to be able to do guard-less programming; maybe we can do that some day in the future with improved hardware and compilers.

Here is another way to do the counting, resulting in smaller code:

Mix.install([{:unicode_set, "~> 1.0"}])

# Variant of the counting approach that matches one UTF-8 codepoint at a time
# and tests set membership with an integer `case` (one clause per codepoint),
# which results in smaller generated code than matching whole binary patterns.
defmodule ProbablyFaster do
  require Unicode.Set

  # One `i -> true` clause per codepoint in the set, plus a final
  # `_ -> false` catch-all.
  case_ast =
    for s <-
          Unicode.Set.to_pattern!(
            "[[:Lu:], [:Ll:], [:Lt:], [:Lm:], [:Lo:], [:Nl:], [:Mn:], [:Mc:], [:Nd:], [:Pc:], [:Cf:]]"
              ) do
      # Each pattern `s` is a binary; extract its codepoint as an integer.
      <<c :: utf8>> = s
      hd(
        quote do
          unquote(c) -> true
        end
      )
    end ++
      quote do
        _ -> false
      end

  # Split the binary into {matching_prefix, rest} by byte count.
  def case_fn(binary) do
    size = count_bytes(binary)
    <<pre::binary-size(^size), post::binary>> = binary
    {pre, post}
  end

  # Prefix length = total size minus the size of the unconsumed remainder.
  defp count_bytes(binary) do
    rest = do_count_bytes(binary)
    byte_size(binary) - byte_size(rest)
  end

  # Walks the binary one codepoint at a time; returns the remaining binary
  # starting at the first codepoint that is not in the set (or on invalid
  # UTF-8 / end of input, via the `_` clause).
  defp do_count_bytes(binary) do
    case binary do
      <<c :: utf8, rest :: binary>> ->
        # Unquote fragment: the generated membership clauses are injected
        # into this integer `case` at compile time.
        accept? = case c do
                    unquote(case_ast)
                  end
        case accept? do
          true -> do_count_bytes(rest)
          false -> binary
        end
      _ ->
        binary
    end
  end
end

The resulting Erlang code will look this:

case_fn(_binary@1) ->
    _size@1 = count_bytes(_binary@1),
    <<_pre@1:_size@1/binary, _post@1/binary>> = _binary@1,
    {_pre@1, _post@1}.

count_bytes(_binary@1) ->
    _rest@1 = do_count_bytes(_binary@1),
    erlang:byte_size(_binary@1) - erlang:byte_size(_rest@1).

do_count_bytes(_binary@1) ->
    case _binary@1 of
        <<_c@1/utf8, _rest@1/binary>> ->
            _accept?@1 = case _c@1 of
                             44 -> true;
                             48 -> true;
                             49 -> true;
                             50 -> true;
                                  .
                                  .
                                  .
                             917996 -> true;
                             917997 -> true;
                             917998 -> true;
                             917999 -> true;
                             _ -> false
                         end,
            case _accept?@1 of
                true -> do_count_bytes(_rest@1);
                false -> _binary@1
            end;
        _ -> _binary@1
    end.

1 Like

I have found two bottlenecks in the compiler. One of them is only exposed when compiling from Erlang code (not from Elixir code), but the other one caused compilation of my ProbablyFaster module to take more than a minute to compile.

Here is the PR for the fix, which will be included in Erlang/OTP 28.2:

With the fix applied, ProbablyFaster will compile in about 7 seconds.

10 Likes

Woot! Amazing. I could not see a compiler slow down on my case+byte_size version. So I am surprised yours does have one. Do you know why? Perhaps OTP versions (I am on 27)?

I took a look at the code; it’s at least 2 years old, so this might go back a bit. Looking forward to trying it out — who knows, there might be other places like this that can be optimized. Great work @bjorng!

1 Like

Yes, I think I do. My code has more than 100000 clauses with identical bodies (true). That causes the bottleneck. Your code doesn’t have that many sequential clauses with the same body because binary matching is done one byte at a time. I think your code will only have at most 256 sequential clauses with the same body.

Hi @bjorng,

Thanks for digging into this; always appreciate bug fixes and speed improvements

My code has more than 100000 clauses with identical bodies (true). That causes the bottleneck. Your code doesn’t have that many sequential clauses with the same body

Over at Elixir Forum this topic is also discussed, and I post benchmarks to validate claims. I could not benchmark a difference with the Elixir code I use.

# Compile-time benchmark module: generates 200,000 `do_foo/2` clauses at
# compile time to measure how clause count affects compilation speed. The
# exact clause structure IS the benchmark — do not restructure.
defmodule Heads.A do
  # Public entry point; guards ensure both arguments are integers before
  # dispatching to the generated clauses.
  def foo(x,y) when is_integer(x) and is_integer(y), do: do_foo(x,y)

  # One clause per i, matching only when both arguments equal i.
  # Note: there is no catch-all clause, so do_foo(a, b) with a != b or
  # values outside 1..200_000 raises FunctionClauseError.
  for i <- 1..200_000 do
    def do_foo(_x = unquote(i), _y = unquote(i)), do: unquote(i)
  end
end

Sample size: 200.000

Different body (as above)

test_defp.ex compiled in 145631.812 ms

Same body (-> true instead of -> unquote(i))

test_defp.ex compiled in 140443.401 ms

To me it seems a generalization to state “100.000 clauses with identical bodies (true) causes the issue”. And there is a nuance to it which is unknown to me. Am I right?

Edit: removed clutter from results

I think a thing to be aware of is the Erlang compiler internally transforms multiple function clauses to a case which is why the difference between the compile times is so small.

That part I know (and was expected). The bench script was used to proof that point.

I post these to show I see no difference to uni-body (true) and multi-body.

I don’t believe your benchmarks are complex enough; the compiler has probably already optimized it away before it arrives at this part. If you look at the example above, there is a bit more in the body than a simple integer or string.

Can you reproduce the slow down using this snippet? Why does the efficiency guide not mention how expensive exported functions can be for compilation times - #26 by bjorng

Consistent timings

elixir bjorn.exs     23.19s user     14.38s system     102% cpu     36.821 total
elixir jose.exs      15.23s user      2.56s system     100% cpu     17.694 total

User CPU time
The amount of CPU time spent running Elixir code in user mode (executing instructions in the Elixir runtime and BEAM VM, not kernel-level system calls)

System CPU time
The amount of CPU time spent in the operating system kernel on behalf of the process (file I/O, memory allocation).

1 Like

Right, I believe Bjorn’s code should compile as fast or faster than mine. I believe that’s the issue that was addressed. So his snippet is effectively reproducing it, if that makes any sense.

1 Like

So fast, it must be wrong. Build using regexes (split in sets to stay within regex chars limits) with an accumulator. Have to attend a birthday party; happy hacking!

also also: did not test runtime performance. It was just an experiment with José’s code that got outta hand.

bench_compile % time elixir bart.exs
{“foo”, “-bar”}
elixir bart.exs 0.71s user 0.48s system 136% cpu 0.866 total

Mix.install([{:unicode_set, "~> 1.0"}])

# Experimental regex-based splitter (author-admitted "must be wrong"):
# builds negated character-class regexes at compile time, chunked to stay
# within regex character limits, and splits the input at the first
# non-matching character.
defmodule WithRegex do
  require Unicode.Set

  # Patterns per compiled regex; chunking keeps each class under the
  # regex engine's size limits.
  @chunk_size 10_000

  # Compile one negated class "[^...]" per chunk of patterns at module
  # compile time; Macro.escape/1 lets the compiled Regex structs be
  # injected as a default argument below.
  # NOTE(review): the chunk entries are joined with "|", which inside a
  # character class matches a literal "|" — presumably unintended; verify
  # the generated classes.
  regex_sets =
    Unicode.Set.to_pattern!(
      "[[:Lu:], [:Ll:], [:Lt:], [:Lm:], [:Lo:], [:Nl:], [:Mn:], [:Mc:], [:Nd:], [:Pc:], [:Cf:]]"
    )
    |> Enum.chunk_every(@chunk_size)
    |> Enum.map(fn char_set ->
      ("[^" <> Enum.join(char_set, "|") <> "]")
      |> Regex.compile!()
      |> Macro.escape()
    end)

  # Unquote fragment: the escaped regex list becomes the default value of
  # the third argument.
  def red(binary, acc \\ "", regexes \\ unquote(regex_sets))
  # All regexes exhausted: acc holds the accepted prefix, binary the rest.
  def red(binary, acc, []), do: {acc, binary}
  def red(binary, acc, [h | t]) do
    # Split at the first character NOT in this chunk's set; the capture is
    # kept so the remainder can be reassembled.
    result = String.split(binary, h, include_captures: true, parts: 2)

    case result do
      # Non-matching char at position 0 for this regex: try remaining regexes.
      ["", capture, post] ->
        red(capture <> post, acc, t)

      # NOTE(review): the recursive call's result is discarded — the clause
      # immediately returns {capture <> post, pre}, making the call dead
      # code. Intended semantics unclear; TODO confirm which value should
      # be returned.
      [pre, capture, post] ->
        red(capture <> post, pre, t)
        {capture <> post, pre}

      # No non-matching character found by this regex: try the next one.
      [_] ->
        red(binary, acc, t)
    end
  end
end

# Demo call; per the output quoted above it prints {"foo", "-bar"}.
IO.inspect(WithRegex.red("foo-bar"))

That’s a fun exercise, but you don’t want to use regex for a lexer, unless it’s a constrained version. Although pattern matching will be faster, by reducing the number of clauses it’s possible to shrink the code by an order of magnitude and use some range guards, as there are quite a lot of repeated bytes — especially the four-byte UTF-8 sequences, which all start with 240.

I have zero knowledge of lexers, so I did expect the example not to survive real life. Sometimes it’s just hacking for the fun of it :slight_smile:

Would love to see a realistic set of benchmark inputs. But only when someone has it already available.