From 41b22efd8289976ac5b70cbdc118a84ea3619de6 Mon Sep 17 00:00:00 2001
From: paugier <pierre.augier@univ-grenoble-alpes.fr>
Date: Tue, 12 Oct 2021 11:57:30 +0200
Subject: [PATCH 01/32] Benchmark with HPy 0.0.3 and PyPy3 7.3.6

---
 README.md                     | 63 +++++++++++++++++------------------
 bench/bench.jl                |  4 +--
 bench/bench_cpy_vs_hpy.py     | 57 ++++++++++++++++++++++++-------
 bench/make_bench_piconumpy.py |  9 ++---
 4 files changed, 83 insertions(+), 50 deletions(-)

diff --git a/README.md b/README.md
index a0bad3d..6b2a908 100644
--- a/README.md
+++ b/README.md
@@ -104,19 +104,13 @@ pypy -m pip install pip -U
 pypy -m pip install numpy cython pytest transonic pythran
 ```
 
-We need to install the correct version of HPy for the version of PyPy we are using:
+One can check which HPy version is vendored with PyPy:
 
 ```bash
 pypy -c "import hpy.universal as u; print(u.get_version())"
 ```
 
-gives `('0.0.2rc2.dev12+gc9660c2', 'c9660c2')`.
-
-```bash
-cd ~/Dev/hpy
-# update to the correct commit
-pypy setup.py develop
-```
+gives `('0.0.3', '2196f14')`.
 
 Now we can build-install PicoNumpy:
 
@@ -136,36 +130,36 @@ make
 
 ## Few results
 
-As of today (6 July 2021), HPy is not yet ready for high performance, but at
-least (with HPy 0.0.2) it runs !
+As of today (12 October 2021), HPy is not yet ready for high performance, but at
+least (with HPy 0.0.3) it runs!
 
 ### At home (Intel(R) Core(TM) i5-8400 CPU @ 2.80GHz)
 
 - With CPython
 
 ```
-Julia                      :     1 * norm = 0.00196 s
-PicoNumpy (CPython C-API)  :  9.42 * norm
-PicoNumpy (HPy CPy ABI)    :  9.95 * norm
-PicoNumpy (HPy Universal)  :  10.4 * norm
-Transonic-Pythran          : 0.497 * norm
-Numpy                      :  27.5 * norm
-PicoNumpy (purepy)         :  37.3 * norm
-PicoNumpy (purepy_array)   :  37.7 * norm
-PicoNumpy (Cython)         :  28.9 * norm
+Julia                      :     1 * norm = 0.0171 s
+PicoNumpy (CPython C-API)  :  11.1 * norm
+PicoNumpy (HPy CPy ABI)    :  11.6 * norm
+PicoNumpy (HPy Universal)  :  12.1 * norm
+Transonic-Pythran          : 0.537 * norm
+Numpy                      :  33.8 * norm
+PicoNumpy (purepy)         :  43.7 * norm
+PicoNumpy (purepy_array)   :  44.8 * norm
+PicoNumpy (Cython)         :  33.9 * norm
 ```
 
 - With PyPy3
 
 ```
-Julia                      :     1 * norm = 0.00196 s
-PicoNumpy (CPython C-API)  :  34.1 * norm
-PicoNumpy (HPy Universal)  :  12.8 * norm
-Transonic-Pythran          : 0.539 * norm
-Numpy                      :   232 * norm
-PicoNumpy (purepy)         :  4.39 * norm
-PicoNumpy (purepy_array)   :  6.33 * norm
-PicoNumpy (Cython)         :   274 * norm
+Julia                      :     1 * norm = 0.0171 s
+PicoNumpy (CPython C-API)  :  39.2 * norm
+PicoNumpy (HPy Universal)  :  13.1 * norm
+Transonic-Pythran          : 0.562 * norm
+Numpy                      :   286 * norm
+PicoNumpy (purepy)         :  5.59 * norm
+PicoNumpy (purepy_array)   :  7.41 * norm
+PicoNumpy (Cython)         :   282 * norm
 ```
 
 #### Simpler benchmarks (bench/bench_cpy_vs_hpy.py)
@@ -173,14 +167,19 @@ PicoNumpy (Cython)         :   274 * norm
 - With CPython
 
 ```
-CPython C-API:   1.92 seconds
-HPy [Universal]: 2.08 seconds
-HPy [CPy ABI]:   2.02 seconds
+{'cache_tag': 'cpython-39',
+ 'version': sys.version_info(major=3, minor=9, micro=6, releaselevel='final', serial=0)}
+CPython C-API:   0.193 seconds (11.2 * Julia)
+HPy [Universal]: 0.208 seconds (12.1 * Julia)
+HPy [CPy ABI]:   0.201 seconds (11.7 * Julia)
 ```
 
 - With PyPy3
 
 ```
-CPython C-API:   5.75 seconds
-HPy [Universal]: 2.11 seconds
+{'cache_tag': 'pypy37',
+ 'version': sys.pypy_version_info(major=7, minor=3, micro=6, releaselevel='final', serial=0)}
+CPython C-API:   0.592 seconds (34.6 * Julia)
+HPy [Universal]: 0.207 seconds (12.1 * Julia)
+Python list:     0.093 seconds ( 5.4 * Julia)
 ```
diff --git a/bench/bench.jl b/bench/bench.jl
index 00cedff..c9d08ef 100644
--- a/bench/bench.jl
+++ b/bench/bench.jl
@@ -65,10 +65,10 @@ function bench(n_sleds, n_time)
 end
 
 
-n_sleds = 10
+n_sleds = 100
 n_time = 200
 
-nb_runs = 200
+nb_runs = 50
 
 times = zeros(nb_runs)
 
diff --git a/bench/bench_cpy_vs_hpy.py b/bench/bench_cpy_vs_hpy.py
index e54ad54..c97f3d8 100644
--- a/bench/bench_cpy_vs_hpy.py
+++ b/bench/bench_cpy_vs_hpy.py
@@ -1,8 +1,9 @@
 import sys
-import time
+from time import perf_counter
 import random
 from math import pi, cos, sin
 from pathlib import Path
+from pprint import pprint
 
 here = Path(__file__).absolute().parent
 
@@ -75,14 +76,18 @@ def bench(mod, n_sleds, n_time):
     u_init = mod.zeros(n_sleds)
     for i in range(n_sleds):
         u_init[i] += 3.5
-    start = time.time()
-    solver(mod, board, x_init, y_init, u_init, v_init, 0.01, n_time)
-    end = time.time()
-    return end - start
+    times = []
+    for _ in range(20):
+        start = perf_counter()
+        solver(mod, board, x_init, y_init, u_init, v_init, 0.01, n_time)
+        times.append(perf_counter() - start)
+
+    times.sort()
+    return times[len(times) // 2]
 
 
 N_SLEDS = 100
-N_TIME = 2000
+N_TIME = 200
 
 
 def import_piconumpy_hpy_universal():
@@ -97,18 +102,46 @@ def main():
 
     import piconumpy._piconumpy_cpython_capi as pnp_capi
 
-    t = bench(pnp_capi, N_SLEDS, N_TIME)
-    print(f"CPython C-API:   {t:.2f} seconds")
+    pprint({key: sys.implementation.__dict__[key] for key in ("cache_tag", "version")})
+
+    tmp_result_julia = Path("tmp_result_julia.txt")
+    if tmp_result_julia.exists():
+        with open("tmp_result_julia.txt") as file:
+            norm = float(file.read())
+        end = ""
+    else:
+        norm = False
+        end = "\n"
+
+    t_capi = bench(pnp_capi, N_SLEDS, N_TIME)
+    print(f"CPython C-API:   {t_capi:.3f} seconds", end=end)
+    if norm:
+        print(f" ({t_capi/norm:.1f} * Julia)")
 
     pnp_hpy_universal = import_piconumpy_hpy_universal()
-    t = bench(pnp_hpy_universal, N_SLEDS, N_TIME)
-    print(f"HPy [Universal]: {t:.2f} seconds")
+    t_hpy_univ = bench(pnp_hpy_universal, N_SLEDS, N_TIME)
+    print(f"HPy [Universal]: {t_hpy_univ:.3f} seconds", end=end)
+
+    if norm:
+        print(f" ({t_hpy_univ/norm:.1f} * Julia)")
 
     if not IS_PYPY:
         import piconumpy._piconumpy_hpy as pnp_hpy
 
-        t = bench(pnp_hpy, N_SLEDS, N_TIME)
-        print(f"HPy [CPy ABI]:   {t:.2f} seconds")
+        t_hpy_cpy_abi = bench(pnp_hpy, N_SLEDS, N_TIME)
+        print(f"HPy [CPy ABI]:   {t_hpy_cpy_abi:.3f} seconds", end=end)
+
+        if norm:
+            print(f" ({t_hpy_cpy_abi/norm:.1f} * Julia)")
+
+    if IS_PYPY:
+        import piconumpy.purepy as pnp_with_list
+
+        t_with_list = bench(pnp_with_list, N_SLEDS, N_TIME)
+        print(f"Python list:     {t_with_list:.3f} seconds", end=end)
+
+        if norm:
+            print(f" ({t_with_list/norm:4.1f} * Julia)")
 
 
 if __name__ == "__main__":
diff --git a/bench/make_bench_piconumpy.py b/bench/make_bench_piconumpy.py
index c15b3f6..c1a92d0 100644
--- a/bench/make_bench_piconumpy.py
+++ b/bench/make_bench_piconumpy.py
@@ -75,12 +75,12 @@ def create_tmp_file(name_module):
 name = fmt_name.format("Julia")
 print(f"{name}:     1 * norm = {norm:4.3g} s")
 
-n_sleds = 10
+n_sleds = 100
 n_time = 200
 
 g = locals()
 
-def timeit(name_func, name):
+def timeit(name_func, name, total_duration=2):
     return timeit_verbose(
         name_func + "(n_sleds, n_time)",
         globals=g,
@@ -88,6 +88,7 @@ def timeit(name_func, name):
         print_time=False,
         norm=norm,
         max_length_name=max_length_name,
+        total_duration=total_duration,
     )
 
 timeit("bench", name="PicoNumpy (CPython C-API)")
@@ -95,14 +96,14 @@ def timeit(name_func, name):
     timeit("bench_hpy", name="PicoNumpy (HPy CPy ABI)")
 timeit("bench_hpy_universal", name="PicoNumpy (HPy Universal)")
 timeit("bench_pythran", name="Transonic-Pythran")
-timeit("bench_numpy", name="Numpy")
+timeit("bench_numpy", name="Numpy", total_duration=4)
 timeit(
     "bench_piconumpy_purepy", name="PicoNumpy (purepy)",
 )
 timeit(
     "bench_piconumpy_purepy_array", name="PicoNumpy (purepy_array)",
 )
-timeit("bench_cython", name="PicoNumpy (Cython)")
+timeit("bench_cython", name="PicoNumpy (Cython)", total_duration=4)
 """
 )
 

From ba809f3dc55a249537a275013418f5c74d72b31c Mon Sep 17 00:00:00 2001
From: paugier <pierre.augier@univ-grenoble-alpes.fr>
Date: Tue, 12 Oct 2021 12:07:44 +0200
Subject: [PATCH 02/32] CI: use HPy 0.0.3

---
 .github/workflows/tests.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 8083adf..3385e2c 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -24,9 +24,8 @@ jobs:
 
     - name: Install dependencies
       run: |
-        git clone -b master --single-branch https://github.com/hpyproject/hpy
+        git clone -b release/0.0.3 --single-branch https://github.com/hpyproject/hpy
         cd hpy
-        git checkout 7b45ce522
         pip install .
         pip install numpy cython pytest transonic pythran
 
@@ -47,5 +46,6 @@ jobs:
     - name: Run bench
       run: |
         cd bench
+        make tmp_result_julia.txt
         make bench_hpy
         make

From 9e5aae731f104ab9c53a379649452a6e58cb7ed4 Mon Sep 17 00:00:00 2001
From: paugier <pierre.augier@univ-grenoble-alpes.fr>
Date: Tue, 12 Oct 2021 13:56:15 +0200
Subject: [PATCH 03/32] CI: pypy-3.7-nightly

---
 .github/workflows/tests.yml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 3385e2c..7e1601d 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -8,7 +8,7 @@ jobs:
     strategy:
       max-parallel: 5
       matrix:
-        python-version: [3.7, 3.8, 3.9]
+        python-version: [3.7, 3.8, 3.9, pypy-3.7-nightly]
 
     steps:
 
@@ -22,11 +22,15 @@ jobs:
       with:
         python-version: ${{ matrix.python-version }}
 
-    - name: Install dependencies
+    - if: startsWith(matrix.python-version, 'pypy') != true
+      name: Install HPy (only for CPython)
       run: |
         git clone -b release/0.0.3 --single-branch https://github.com/hpyproject/hpy
         cd hpy
         pip install .
+
+    - name: Install dependencies
+      run: |
         pip install numpy cython pytest transonic pythran
 
     - name: Checkout

From d3025c54652a73ce519a0db7921ca2dc15899399 Mon Sep 17 00:00:00 2001
From: paugier <pierre.augier@univ-grenoble-alpes.fr>
Date: Tue, 12 Oct 2021 14:49:33 +0200
Subject: [PATCH 04/32] xfail 2 tests for PyPy

---
 piconumpy/test_cpython_capi.py  |  1 +
 piconumpy/test_hpy_universal.py | 14 ++++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/piconumpy/test_cpython_capi.py b/piconumpy/test_cpython_capi.py
index a1638dc..cedbed5 100644
--- a/piconumpy/test_cpython_capi.py
+++ b/piconumpy/test_cpython_capi.py
@@ -6,6 +6,7 @@
 
 class Tests:
     piconumpy = _piconumpy_cpython_capi
+
     def _array(self, *args):
         return self.piconumpy.array(*args)
 
diff --git a/piconumpy/test_hpy_universal.py b/piconumpy/test_hpy_universal.py
index fbf5ce4..2a470ca 100644
--- a/piconumpy/test_hpy_universal.py
+++ b/piconumpy/test_hpy_universal.py
@@ -1,3 +1,5 @@
+import sys
+
 import pytest
 
 from .util_hpy import import_ext
@@ -15,3 +17,15 @@
 )
 class TestsCPyABI(_Tests):
     piconumpy = piconumpy_universal
+
+    def test_multiply(self):
+        if sys.implementation.name == "pypy":
+            pytest.xfail("Expected failure with PyPy (but should work)")
+
+        super().test_multiply()
+
+    def test_add(self):
+        if sys.implementation.name == "pypy":
+            pytest.xfail("Expected failure with PyPy (but should work)")
+
+        super().test_add()

From 0b809bc2f51b3063782260851892aba46e1d97f3 Mon Sep 17 00:00:00 2001
From: paugier <pierre.augier@univ-grenoble-alpes.fr>
Date: Tue, 12 Oct 2021 15:04:20 +0200
Subject: [PATCH 05/32] rm piconumpy/_piconumpy_hpy.py

---
 .github/workflows/tests.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 7e1601d..d989a0c 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -42,10 +42,11 @@ jobs:
       run: |
         python setup.py develop
         python setup.py --hpy-abi=universal develop
+        rm -f piconumpy/_piconumpy_hpy.py
 
     - name: Run tests
       run: |
-        pytest -s
+        pytest -v
 
     - name: Run bench
       run: |

From f788f1a04d01993c982681f58953ef9fad80219f Mon Sep 17 00:00:00 2001
From: paugier <pierre.augier@univ-grenoble-alpes.fr>
Date: Tue, 12 Oct 2021 15:22:11 +0200
Subject: [PATCH 06/32] Skip too long benchmarks

---
 bench/make_bench_piconumpy.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/bench/make_bench_piconumpy.py b/bench/make_bench_piconumpy.py
index c1a92d0..b02433f 100644
--- a/bench/make_bench_piconumpy.py
+++ b/bench/make_bench_piconumpy.py
@@ -96,14 +96,20 @@ def timeit(name_func, name, total_duration=2):
     timeit("bench_hpy", name="PicoNumpy (HPy CPy ABI)")
 timeit("bench_hpy_universal", name="PicoNumpy (HPy Universal)")
 timeit("bench_pythran", name="Transonic-Pythran")
-timeit("bench_numpy", name="Numpy", total_duration=4)
+try:
+    timeit("bench_numpy", name="Numpy", total_duration=8)
+except RuntimeError:
+    print("Skip bench_numpy because it's too slow")
 timeit(
     "bench_piconumpy_purepy", name="PicoNumpy (purepy)",
 )
 timeit(
     "bench_piconumpy_purepy_array", name="PicoNumpy (purepy_array)",
 )
-timeit("bench_cython", name="PicoNumpy (Cython)", total_duration=4)
+try:
+    timeit("bench_cython", name="PicoNumpy (Cython)", total_duration=8)
+except RuntimeError:
+    print("Skip bench_cython because it's too slow")
 """
 )
 

From 6f1d5119f79675f5b81d261721a4a9c2b18ee9e2 Mon Sep 17 00:00:00 2001
From: paugier <pierre.augier@univ-grenoble-alpes.fr>
Date: Tue, 12 Oct 2021 15:25:49 +0200
Subject: [PATCH 07/32] rerun bench_hpy

---
 .github/workflows/tests.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index d989a0c..eff1152 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -54,3 +54,5 @@ jobs:
         make tmp_result_julia.txt
         make bench_hpy
         make
+        # let's rerun bench_hpy to get these results also at the end
+        make bench_hpy

From aa6f146860e02aa8a4a4ddb89ac4d5e0c7a78c1c Mon Sep 17 00:00:00 2001
From: paugier <pierre.augier@univ-grenoble-alpes.fr>
Date: Tue, 12 Oct 2021 16:03:28 +0200
Subject: [PATCH 08/32] Small improvements

---
 bench/bench.jl                | 2 +-
 bench/bench_cpy_vs_hpy.py     | 7 ++++---
 bench/make_bench_piconumpy.py | 3 +++
 bench/profile_piconumpy.py    | 2 ++
 4 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/bench/bench.jl b/bench/bench.jl
index c9d08ef..bd98571 100644
--- a/bench/bench.jl
+++ b/bench/bench.jl
@@ -68,7 +68,7 @@ end
 n_sleds = 100
 n_time = 200
 
-nb_runs = 50
+nb_runs = 200
 
 times = zeros(nb_runs)
 
diff --git a/bench/bench_cpy_vs_hpy.py b/bench/bench_cpy_vs_hpy.py
index c97f3d8..38f1ba1 100644
--- a/bench/bench_cpy_vs_hpy.py
+++ b/bench/bench_cpy_vs_hpy.py
@@ -109,6 +109,7 @@ def main():
         with open("tmp_result_julia.txt") as file:
             norm = float(file.read())
         end = ""
+        print(f"Julia:           {norm:.3f} seconds")
     else:
         norm = False
         end = "\n"
@@ -116,14 +117,14 @@ def main():
     t_capi = bench(pnp_capi, N_SLEDS, N_TIME)
     print(f"CPython C-API:   {t_capi:.3f} seconds", end=end)
     if norm:
-        print(f" ({t_capi/norm:.1f} * Julia)")
+        print(f" ({t_capi/norm:4.1f} * Julia)")
 
     pnp_hpy_universal = import_piconumpy_hpy_universal()
     t_hpy_univ = bench(pnp_hpy_universal, N_SLEDS, N_TIME)
     print(f"HPy [Universal]: {t_hpy_univ:.3f} seconds", end=end)
 
     if norm:
-        print(f" ({t_hpy_univ/norm:.1f} * Julia)")
+        print(f" ({t_hpy_univ/norm:4.1f} * Julia)")
 
     if not IS_PYPY:
         import piconumpy._piconumpy_hpy as pnp_hpy
@@ -132,7 +133,7 @@ def main():
         print(f"HPy [CPy ABI]:   {t_hpy_cpy_abi:.3f} seconds", end=end)
 
         if norm:
-            print(f" ({t_hpy_cpy_abi/norm:.1f} * Julia)")
+            print(f" ({t_hpy_cpy_abi/norm:4.1f} * Julia)")
 
     if IS_PYPY:
         import piconumpy.purepy as pnp_with_list
diff --git a/bench/make_bench_piconumpy.py b/bench/make_bench_piconumpy.py
index b02433f..4a76e9a 100644
--- a/bench/make_bench_piconumpy.py
+++ b/bench/make_bench_piconumpy.py
@@ -47,6 +47,7 @@ def create_tmp_file(name_module):
 import numpy as np
 from piconumpy import array
 from math import pi, cos, sin
+from pprint import pprint
 
 IS_PYPY = hasattr(sys, 'pypy_version_info')
 """
@@ -65,6 +66,8 @@ def create_tmp_file(name_module):
 if not IS_PYPY:
     from tmp_hpy import bench as bench_hpy
 
+pprint({key: sys.implementation.__dict__[key] for key in ("cache_tag", "version")})
+
 # get norm from Julia benchmark
 with open("tmp_result_julia.txt") as file:
     norm = float(file.read())
diff --git a/bench/profile_piconumpy.py b/bench/profile_piconumpy.py
index b7de388..3bde5ae 100644
--- a/bench/profile_piconumpy.py
+++ b/bench/profile_piconumpy.py
@@ -7,12 +7,14 @@
 import tmp_purepy
 import tmp_purepy_array
 import tmp_cython
+import tmp_hpy_universal
 
 methods = {
     "cpython-c-api": bench_array1d,
     "purepy": tmp_purepy,
     "purepy_array": tmp_purepy_array,
     "cython": tmp_cython,
+    "universal": tmp_hpy_universal,
 }
 
 module = methods.get(sys.argv[-1], bench_array1d)

From 968cc4362682564b08c7a269425f17679006bcb3 Mon Sep 17 00:00:00 2001
From: paugier <pierre.augier@univ-grenoble-alpes.fr>
Date: Wed, 13 Oct 2021 22:35:18 +0200
Subject: [PATCH 09/32] microbench loop_sum

---
 bench/microbench_loop_sum/Makefile  | 13 +++++++
 bench/microbench_loop_sum/README.md | 35 +++++++++++++++++++
 bench/microbench_loop_sum/bench.jl  | 21 ++++++++++++
 bench/microbench_loop_sum/bench.py  | 53 +++++++++++++++++++++++++++++
 4 files changed, 122 insertions(+)
 create mode 100644 bench/microbench_loop_sum/Makefile
 create mode 100644 bench/microbench_loop_sum/README.md
 create mode 100644 bench/microbench_loop_sum/bench.jl
 create mode 100644 bench/microbench_loop_sum/bench.py

diff --git a/bench/microbench_loop_sum/Makefile b/bench/microbench_loop_sum/Makefile
new file mode 100644
index 0000000..a4259be
--- /dev/null
+++ b/bench/microbench_loop_sum/Makefile
@@ -0,0 +1,13 @@
+
+all: tmp_result_julia.txt
+	@python -c "import sys; from pprint import pprint as p; p({key: sys.implementation.__dict__[key] for key in ('cache_tag', 'version')})"
+	@python bench.py list
+	@python bench.py purepy
+	@python bench.py _piconumpy_hpy
+	@python bench.py _piconumpy_cpython_capi
+
+tmp_result_julia.txt: bench.jl
+	@julia bench.jl > tmp_result_julia.txt
+
+clean:
+	rm -f tmp_result_julia.txt
\ No newline at end of file
diff --git a/bench/microbench_loop_sum/README.md b/bench/microbench_loop_sum/README.md
new file mode 100644
index 0000000..93aa9d6
--- /dev/null
+++ b/bench/microbench_loop_sum/README.md
@@ -0,0 +1,35 @@
+# Microbenchmark sum_loop
+
+We measure the performance for this function:
+
+```python
+def sum_loop(arr):
+    result = 0.0
+    for value in arr:
+        result += value
+    return result
+```
+
+One can run the benchmarks with `make`.
+
+With PyPy3.7, I get:
+
+```
+{'cache_tag': 'pypy37',
+ 'version': sys.pypy_version_info(major=7, minor=3, micro=6, releaselevel='final', serial=0)}
+list                          : 1.75e-05 s (  1.6 * Julia)
+purepy                        : 1.95e-05 s (  1.8 * Julia)
+_piconumpy_hpy                : 2.18e-04 s ( 20.5 * Julia)
+_piconumpy_cpython_capi       : 1.19e-03 s (112.1 * Julia)
+```
+
+With CPython:
+
+```
+{'cache_tag': 'cpython-39',
+ 'version': sys.version_info(major=3, minor=9, micro=6, releaselevel='final', serial=0)}
+list                          : 2.65e-04 s ( 25.0 * Julia)
+purepy                        : 1.27e-03 s (120.0 * Julia)
+_piconumpy_hpy                : 4.24e-04 s ( 39.9 * Julia)
+_piconumpy_cpython_capi       : 3.50e-04 s ( 33.0 * Julia)
+```
\ No newline at end of file
diff --git a/bench/microbench_loop_sum/bench.jl b/bench/microbench_loop_sum/bench.jl
new file mode 100644
index 0000000..440c755
--- /dev/null
+++ b/bench/microbench_loop_sum/bench.jl
@@ -0,0 +1,21 @@
+using Statistics
+
+function sum_loop(arr)
+    result = 0.
+    for i in eachindex(arr)
+        result += arr[i]
+    end
+    return result
+end
+
+size = 10000
+nb_runs = 200
+
+times = zeros(nb_runs)
+
+for irun in 1:nb_runs
+    arr = rand(size)
+    times[irun] = @elapsed sum_loop(arr)
+end
+
+println(median(times))
diff --git a/bench/microbench_loop_sum/bench.py b/bench/microbench_loop_sum/bench.py
new file mode 100644
index 0000000..03f5040
--- /dev/null
+++ b/bench/microbench_loop_sum/bench.py
@@ -0,0 +1,53 @@
+import sys
+from time import perf_counter
+from pathlib import Path
+from random import random
+
+import numpy as np
+
+tmp_result_julia = Path("tmp_result_julia.txt")
+if tmp_result_julia.exists():
+    with open("tmp_result_julia.txt") as file:
+        norm = float(file.read())
+else:
+    print("tmp_result_julia.txt does not exist. First execute with `make`")
+
+try:
+    method = sys.argv[1]
+except IndexError:
+    method = "purepy"
+
+
+def sum_loop(arr):
+    result = 0.0
+    for value in arr:
+        result += value
+    return result
+
+
+if method == "_piconumpy_hpy":
+    from piconumpy.util_hpy import import_ext
+
+    ext = import_ext()
+    array = ext.array
+elif method == "list":
+    array = list
+else:
+    d = {}
+    exec(f"from piconumpy.{method} import array", d)
+    array = d["array"]
+
+# print(array)
+
+size = 10000
+times = []
+nb_runs = 200
+for _ in range(nb_runs):
+    data_as_list = [random() for _ in range(size)]
+    arr = array(data_as_list)
+    t_start = perf_counter()
+    sum_loop(arr)
+    times.append(perf_counter() - t_start)
+
+time = np.median(times)
+print(f"{method:30s}: {time:.2e} s ({time / norm:5.1f} * Julia)")

From 20fbba36a1f5bedcb8f21e64692025a7c71d9269 Mon Sep 17 00:00:00 2001
From: paugier <pierre.augier@univ-grenoble-alpes.fr>
Date: Wed, 13 Oct 2021 22:52:52 +0200
Subject: [PATCH 10/32] microbench_loop_sum: add numpy

---
 bench/microbench_loop_sum/Makefile  |  1 +
 bench/microbench_loop_sum/README.md | 16 +++++++++-------
 bench/microbench_loop_sum/bench.py  |  2 ++
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/bench/microbench_loop_sum/Makefile b/bench/microbench_loop_sum/Makefile
index a4259be..a54982e 100644
--- a/bench/microbench_loop_sum/Makefile
+++ b/bench/microbench_loop_sum/Makefile
@@ -3,6 +3,7 @@ all: tmp_result_julia.txt
 	@python -c "import sys; from pprint import pprint as p; p({key: sys.implementation.__dict__[key] for key in ('cache_tag', 'version')})"
 	@python bench.py list
 	@python bench.py purepy
+	@python bench.py numpy
 	@python bench.py _piconumpy_hpy
 	@python bench.py _piconumpy_cpython_capi
 
diff --git a/bench/microbench_loop_sum/README.md b/bench/microbench_loop_sum/README.md
index 93aa9d6..0b4cc89 100644
--- a/bench/microbench_loop_sum/README.md
+++ b/bench/microbench_loop_sum/README.md
@@ -17,9 +17,10 @@ With PyPy3.7, I get:
 ```
 {'cache_tag': 'pypy37',
  'version': sys.pypy_version_info(major=7, minor=3, micro=6, releaselevel='final', serial=0)}
-list                          : 1.75e-05 s (  1.6 * Julia)
-purepy                        : 1.95e-05 s (  1.8 * Julia)
-_piconumpy_hpy                : 2.18e-04 s ( 20.5 * Julia)
+list                          : 1.73e-05 s (  1.6 * Julia)
+purepy                        : 1.97e-05 s (  1.9 * Julia)
+numpy                         : 4.12e-03 s (388.6 * Julia)
+_piconumpy_hpy                : 2.14e-04 s ( 20.2 * Julia)
 _piconumpy_cpython_capi       : 1.19e-03 s (112.1 * Julia)
 ```
 
@@ -28,8 +29,9 @@ With CPython:
 ```
 {'cache_tag': 'cpython-39',
  'version': sys.version_info(major=3, minor=9, micro=6, releaselevel='final', serial=0)}
-list                          : 2.65e-04 s ( 25.0 * Julia)
-purepy                        : 1.27e-03 s (120.0 * Julia)
-_piconumpy_hpy                : 4.24e-04 s ( 39.9 * Julia)
-_piconumpy_cpython_capi       : 3.50e-04 s ( 33.0 * Julia)
+list                          : 2.62e-04 s ( 24.7 * Julia)
+purepy                        : 1.25e-03 s (118.2 * Julia)
+numpy                         : 8.66e-04 s ( 81.6 * Julia)
+_piconumpy_hpy                : 4.22e-04 s ( 39.8 * Julia)
+_piconumpy_cpython_capi       : 3.53e-04 s ( 33.3 * Julia)
 ```
\ No newline at end of file
diff --git a/bench/microbench_loop_sum/bench.py b/bench/microbench_loop_sum/bench.py
index 03f5040..db6e6fc 100644
--- a/bench/microbench_loop_sum/bench.py
+++ b/bench/microbench_loop_sum/bench.py
@@ -32,6 +32,8 @@ def sum_loop(arr):
     array = ext.array
 elif method == "list":
     array = list
+elif method == "numpy":
+    array = np.array
 else:
     d = {}
     exec(f"from piconumpy.{method} import array", d)

From 47ebd2e0cc4124cd108f1b387efd1f6766bccb17 Mon Sep 17 00:00:00 2001
From: paugier <pierre.augier@univ-grenoble-alpes.fr>
Date: Fri, 12 Nov 2021 15:04:29 +0100
Subject: [PATCH 11/32] Update bench + PyPy traces

---
 bench/microbench_loop_sum/Makefile  |   7 +-
 bench/microbench_loop_sum/README.md | 166 ++++++++++++++++++++++++++--
 bench/microbench_loop_sum/bench.py  |   9 +-
 3 files changed, 169 insertions(+), 13 deletions(-)

diff --git a/bench/microbench_loop_sum/Makefile b/bench/microbench_loop_sum/Makefile
index a54982e..e394385 100644
--- a/bench/microbench_loop_sum/Makefile
+++ b/bench/microbench_loop_sum/Makefile
@@ -11,4 +11,9 @@ tmp_result_julia.txt: bench.jl
 	@julia bench.jl > tmp_result_julia.txt
 
 clean:
-	rm -f tmp_result_julia.txt
\ No newline at end of file
+	rm -f tmp_*.txt
+
+produce_traces: tmp_result_julia.txt
+	PYPYLOG=jit-log-opt,jit-summary,jit-backend-counts:tmp_traces_list.txt pypy bench.py list
+	PYPYLOG=jit-log-opt,jit-summary,jit-backend-counts:tmp_traces_piconumpy_list.txt pypy bench.py purepy
+	PYPYLOG=jit-log-opt,jit-summary,jit-backend-counts:tmp_traces_piconumpy_hpy.txt pypy bench.py _piconumpy_hpy
diff --git a/bench/microbench_loop_sum/README.md b/bench/microbench_loop_sum/README.md
index 0b4cc89..dade95a 100644
--- a/bench/microbench_loop_sum/README.md
+++ b/bench/microbench_loop_sum/README.md
@@ -17,11 +17,11 @@ With PyPy3.7, I get:
 ```
 {'cache_tag': 'pypy37',
  'version': sys.pypy_version_info(major=7, minor=3, micro=6, releaselevel='final', serial=0)}
-list                          : 1.73e-05 s (  1.6 * Julia)
-purepy                        : 1.97e-05 s (  1.9 * Julia)
-numpy                         : 4.12e-03 s (388.6 * Julia)
-_piconumpy_hpy                : 2.14e-04 s ( 20.2 * Julia)
-_piconumpy_cpython_capi       : 1.19e-03 s (112.1 * Julia)
+list                          : 1.34e-05 s (  1.3 * Julia)
+piconumpy.purepy              : 1.33e-05 s (  1.3 * Julia)
+numpy                         : 4.00e-03 s (376.6 * Julia)
+_piconumpy_hpy                : 1.99e-04 s ( 18.8 * Julia)
+_piconumpy_cpython_capi       : 1.27e-03 s (119.5 * Julia)
 ```
 
 With CPython:
@@ -29,9 +29,153 @@ With CPython:
 ```
 {'cache_tag': 'cpython-39',
  'version': sys.version_info(major=3, minor=9, micro=6, releaselevel='final', serial=0)}
-list                          : 2.62e-04 s ( 24.7 * Julia)
-purepy                        : 1.25e-03 s (118.2 * Julia)
-numpy                         : 8.66e-04 s ( 81.6 * Julia)
-_piconumpy_hpy                : 4.22e-04 s ( 39.8 * Julia)
-_piconumpy_cpython_capi       : 3.53e-04 s ( 33.3 * Julia)
-```
\ No newline at end of file
+list                          : 2.62e-04 s ( 24.6 * Julia)
+piconumpy.purepy              : 1.25e-03 s (117.5 * Julia)
+numpy                         : 7.35e-04 s ( 69.2 * Julia)
+_piconumpy_hpy                : 4.26e-04 s ( 40.2 * Julia)
+_piconumpy_cpython_capi       : 3.52e-04 s ( 33.1 * Julia)
+```
+
+- PyPy is fast with a list (1.3 * Julia, same order of magnitude as Julia)
+and as fast for a piconumpy array based on a list ("piconumpy.purepy", zero
+cost abstraction!)
+
+- Numpy and _piconumpy_cpython_capi are both much slower with PyPy than with
+CPython. We can guess that the Numpy port to HPy would fix that.
+
+- piconumpy_hpy is a bit faster with PyPy (19 * Julia) than with CPython (40 *
+Julia). However, we see that PyPy does not strongly accelerate piconumpy_hpy
+(19 * Julia, 14 * piconumpy_list).
+
+## Traces PyPy `sum_loop`
+
+### List
+
+```
++557: label(p0, p1, p6, p9, f35, f30, p15, p22, p26, i32, i27, p29, descr=TargetToken(140447503809120))
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#10 FOR_ITER')
++606: i44 = uint_ge(i32, i27)
+guard_false(i44, descr=<Guard0x7fbc7b939a00>) [p0, p6, p9, p15, p1, i32, i27, i44, p26, f30, f35]
++615: f45 = getarrayitem_gc_f(p29, i32, descr=<ArrayF 8>)
++622: i47 = int_add(i32, 1)
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#12 STORE_FAST')
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#14 LOAD_FAST')
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#16 LOAD_FAST')
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#18 INPLACE_ADD')
++626: f48 = float_add(f35, f45)
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#20 STORE_FAST')
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#22 JUMP_ABSOLUTE')
++630: setfield_gc(p15, i47, descr=<FieldS pypy.objspace.std.iterobject.W_AbstractSeqIterObject.inst_index 8>)
++634: guard_not_invalidated(descr=<Guard0x7fbc7b939a60>) [p0, p6, p9, p15, p1, f45, f48, None, None]
++634: i51 = getfield_raw_i(140447672379264, descr=<FieldS pypysig_long_struct.c_value 0>)
++647: i53 = int_sub(i51, 1)
++651: setfield_raw(140447672379264, i53, descr=<FieldS pypysig_long_struct.c_value 0>)
++654: i56 = int_lt(i53, 0)
++658: guard_false(i56, descr=<Guard0x7fbc7b939ac0>) [p0, p6, p9, p15, p1, i53, f45, f48, None, None]
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#10 FOR_ITER')
++664: i57 = arraylen_gc(p29, descr=<ArrayF 8>)
++664: jump(p0, p1, p6, p9, f48, f45, p15, p22, p26, i47, i27, p29, descr=TargetToken(140447503809120))
+```
+
+### piconumpy purepy (based on list)
+
+```
++705: label(p0, p1, p6, p9, f53, f46, p15, p22, i49, p29, p38, p42, i43, p45, descr=TargetToken(139748702723776))
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#10 FOR_ITER')
++760: guard_not_invalidated(descr=<Guard0x7f19c7c97d60>) [p0, p6, p9, p15, p1, p22, i49, f46, f53]
++760: p62 = force_token()
++760: enter_portal_frame(21, 28364)
+debug_merge_point(1, 1, '__getitem__;/home/pierre/Dev/piconumpy/piconumpy/purepy.py:27-28~#0 LOAD_FAST')
+debug_merge_point(1, 1, '__getitem__;/home/pierre/Dev/piconumpy/piconumpy/purepy.py:27-28~#2 LOAD_ATTR')
+debug_merge_point(1, 1, '__getitem__;/home/pierre/Dev/piconumpy/piconumpy/purepy.py:27-28~#4 LOAD_FAST')
+debug_merge_point(1, 1, '__getitem__;/home/pierre/Dev/piconumpy/piconumpy/purepy.py:27-28~#6 BINARY_SUBSCR')
++760: i65 = uint_ge(i49, i43)
++763: guard_false(i65, descr=<Guard0x7f19ba0b44a0>) [p0, p6, p9, p15, p1, p22, i49, f46, f53]
++769: f66 = getarrayitem_gc_f(p45, i49, descr=<ArrayF 8>)
+debug_merge_point(1, 1, '__getitem__;/home/pierre/Dev/piconumpy/piconumpy/purepy.py:27-28~#8 RETURN_VALUE')
++776: leave_portal_frame(21)
++776: i69 = int_add(i49, 1)
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#12 STORE_FAST')
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#14 LOAD_FAST')
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#16 LOAD_FAST')
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#18 INPLACE_ADD')
++780: f70 = float_add(f53, f66)
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#20 STORE_FAST')
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#22 JUMP_ABSOLUTE')
++784: i72 = getfield_raw_i(139748871243648, descr=<FieldS pypysig_long_struct.c_value 0>)
++797: i74 = int_sub(i72, 3)
++801: setfield_raw(139748871243648, i74, descr=<FieldS pypysig_long_struct.c_value 0>)
++804: setfield_gc(p15, i69, descr=<FieldS pypy.objspace.std.iterobject.W_AbstractSeqIterObject.inst_index 8>)
++808: i77 = int_lt(i74, 0)
++812: guard_false(i77, descr=<Guard0x7f19c7c97dc0>) [p0, p6, p9, p15, p1, i74, f66, f70, None, None, None]
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#10 FOR_ITER')
++818: i78 = arraylen_gc(p45, descr=<ArrayF 8>)
++818: jump(p0, p1, p6, p9, f70, f66, p15, p22, i69, p29, p38, p42, i43, p45, descr=TargetToken(139748702723776))
+```
+
+### piconumpy hpy
+
+```
++1339: label(p0, p1, p6, p9, f73, p63, p15, i68, p62, descr=TargetToken(139865876151520))
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#10 FOR_ITER')
++1352: p82 = getfield_gc_r(p15, descr=<FieldP pypy.objspace.std.iterobject.W_AbstractSeqIterObject.inst_w_seq 16>)
++1356: guard_nonnull_class(p82, 139866025815200, descr=<Guard0x7f350fde1a60>) [p0, p6, p9, p63, p15, p1, p82, f73]
++1376: p84 = getfield_gc_r(p82, descr=<FieldP pypy.interpreter.typedef.W_HPyObjectUserDictWeakrefable.inst_map 16>)
++1387: guard_value(p84, ConstPtr(ptr85), descr=<Guard0x7f35021fe0b0>) [p0, p6, p9, p63, p15, p1, p82, f73]
++1396: guard_not_invalidated(descr=<Guard0x7f35021fe0f8>) [p0, p6, p9, p63, p15, p1, p82, f73]
++1403: p87 = getfield_gc_r(ConstPtr(ptr86), descr=<FieldP pypy.module._hpy_universal.interp_slot.W_SlotWrapper.inst_w_objclass 32>)
++1414: guard_value(p87, ConstPtr(ptr88), descr=<Guard0x7f350fde1ac0>) [p0, p6, p9, p63, p15, p1, p82, f73]
++1423: i90 = getfield_gc_i(ConstPtr(ptr89), descr=<FieldU pypy.module._hpy_universal.interp_slot.W_SlotWrapper.inst_cfuncptr 8>)
++1427: i92 = int_lt(i68, 0)
++1431: guard_false(i92, descr=<Guard0x7f35021fe140>) [p0, p6, p9, p63, p15, p1, p82, f73]
++1444: i94 = getfield_gc_i(ConstPtr(ptr93), descr=<FieldS list.length 8>)
++1448: i95 = int_is_zero(i94)
++1451: guard_false(i95, descr=<Guard0x7f35021fe188>) [p0, p6, p9, p63, p15, p1, p82, f73]
++1457: i97 = int_sub(i94, 1)
++1461: p99 = getfield_gc_r(ConstPtr(ptr98), descr=<FieldP list.items 16>)
++1465: i100 = getarrayitem_gc_i(p99, i97, descr=<ArrayS 8>)
++1470: i101 = arraylen_gc(p99, descr=<ArrayS 8>)
++1474: i103 = int_rshift(i101, 1)
++1477: i105 = int_sub(i103, 5)
++1481: i106 = int_lt(i97, i105)
++1484: cond_call(i106, ConstClass(_ll_list_resize_hint_really_look_inside_iff__listPtr_Signed_Bool), ConstPtr(ptr108), i97, 0, descr=<Callv 0 rii EF=5>)
++1490: guard_no_exception(descr=<Guard0x7f350fde1b20>) [p0, p6, p9, p63, p15, p1, i68, i90, i100, p82, i97, f73]
++1490: setfield_gc(ConstPtr(ptr110), i97, descr=<FieldS list.length 8>)
++1494: i112 = int_lt(i100, 0)
++1498: guard_false(i112, descr=<Guard0x7f350fde1b80>) [p0, p6, p9, p63, p15, p1, i68, i90, i100, p82, f73]
++1522: setarrayitem_gc(p62, i100, p82, descr=<ArrayP 8>)
++1527: p113 = force_token()
++1548: setfield_gc(p0, p113, descr=<FieldP pypy.interpreter.pyframe.PyFrame.vable_token 8>)
++1552: i115 = call_may_force_i(i90, 139866044538144, i100, i68, descr=<Calli 8 iii EF=7>)
++1663: guard_not_forced(descr=<Guard0x7f350fdfabe8>) [p0, p6, p9, p63, p15, p1, i100, i115, i68, p82, f73]
++1674: guard_no_exception(descr=<Guard0x7f35021fe1d0>) [p0, p6, p9, p63, p15, p1, i100, i115, i68, p82, f73]
++1688: call_n(ConstClass(close), i100, descr=<Callv 0 i EF=5>)
++1754: guard_no_exception(descr=<Guard0x7f350fde1be0>) [p0, p6, p9, p63, p15, p1, i115, i68, p82, f73]
++1768: i117 = int_is_true(i115)
++1771: guard_true(i117, descr=<Guard0x7f35021fe218>) [p0, p6, p9, p63, p15, p1, i115, i68, p82, f73]
++1784: p119 = getfield_gc_r(ConstPtr(ptr118), descr=<FieldP list.items 16>)
++1788: p120 = getarrayitem_gc_r(p119, i115, descr=<ArrayP 8>)
++1793: call_n(ConstClass(close), i115, descr=<Callv 0 i EF=5>)
++1866: guard_no_exception(descr=<Guard0x7f350fde1c40>) [p0, p6, p9, p63, p15, p1, p120, i68, p82, f73]
++1880: guard_nonnull_class(p120, ConstClass(W_FloatObject), descr=<Guard0x7f350fde1ca0>) [p0, p6, p9, p63, p15, p1, p120, i68, p82, f73]
++1907: i123 = getfield_gc_i(p15, descr=<FieldS pypy.objspace.std.iterobject.W_AbstractSeqIterObject.inst_index 8>)
++1918: i125 = int_add(i123, 1)
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#12 STORE_FAST')
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#14 LOAD_FAST')
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#16 LOAD_FAST')
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#18 INPLACE_ADD')
++1923: setfield_gc(p15, i125, descr=<FieldS pypy.objspace.std.iterobject.W_AbstractSeqIterObject.inst_index 8>)
++1927: f126 = getfield_gc_f(p120, descr=<FieldF pypy.objspace.std.floatobject.W_FloatObject.inst_floatval 8 pure>)
++1933: f127 = float_add(f73, f126)
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#20 STORE_FAST')
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#22 JUMP_ABSOLUTE')
++1947: guard_not_invalidated(descr=<Guard0x7f350fde1d00>) [p0, p6, p9, p120, p15, p1, f127, None, None, None]
++1947: i129 = getfield_raw_i(139866044675968, descr=<FieldS pypysig_long_struct.c_value 0>)
++1960: i131 = int_sub(i129, 3)
++1964: setfield_raw(139866044675968, i131, descr=<FieldS pypysig_long_struct.c_value 0>)
++1967: i134 = int_lt(i131, 0)
++1971: guard_false(i134, descr=<Guard0x7f350fde1d60>) [p0, p6, p9, p120, p15, p1, i131, f127, None, None, None]
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#10 FOR_ITER')
++1977: i135 = arraylen_gc(p119, descr=<ArrayP 8>)
++1977: jump(p0, p1, p6, p9, f127, p120, p15, i125, p119, descr=TargetToken(139865876151520))
+```
diff --git a/bench/microbench_loop_sum/bench.py b/bench/microbench_loop_sum/bench.py
index db6e6fc..51c725d 100644
--- a/bench/microbench_loop_sum/bench.py
+++ b/bench/microbench_loop_sum/bench.py
@@ -5,6 +5,8 @@
 
 import numpy as np
 
+on_pypy = sys.implementation.name == 'pypy'
+
 tmp_result_julia = Path("tmp_result_julia.txt")
 if tmp_result_julia.exists():
     with open("tmp_result_julia.txt") as file:
@@ -24,6 +26,10 @@ def sum_loop(arr):
         result += value
     return result
 
+nb_runs = 500
+if on_pypy and method in ["list", "_piconumpy_hpy", "purepy"]:
+    nb_runs = 5000
+
 
 if method == "_piconumpy_hpy":
     from piconumpy.util_hpy import import_ext
@@ -38,12 +44,13 @@ def sum_loop(arr):
     d = {}
     exec(f"from piconumpy.{method} import array", d)
     array = d["array"]
+    if "piconumpy" not in method:
+        method = f"piconumpy.{method}"
 
 # print(array)
 
 size = 10000
 times = []
-nb_runs = 200
 for _ in range(nb_runs):
     data_as_list = [random() for _ in range(size)]
     arr = array(data_as_list)

From 431abefb83852446241e4f44fbed45ef2d62445f Mon Sep 17 00:00:00 2001
From: paugier <pierre.augier@univ-grenoble-alpes.fr>
Date: Wed, 8 Dec 2021 17:19:40 +0100
Subject: [PATCH 12/32] Add more simple low level benchmarks

---
 Makefile                                      |  1 +
 bench/microbench_loop_sum/Makefile            | 39 ++++++++++-
 bench/microbench_loop_sum/bench.jl            |  4 +-
 bench/microbench_loop_sum/bench.py            | 66 ++++++++++++++++---
 bench/microbench_loop_sum/bench_cort.jl       | 35 ++++++++++
 bench/microbench_loop_sum/bench_init_zeros.jl | 21 ++++++
 .../bench_sum_loop_index.jl                   | 23 +++++++
 7 files changed, 177 insertions(+), 12 deletions(-)
 create mode 100644 bench/microbench_loop_sum/bench_cort.jl
 create mode 100644 bench/microbench_loop_sum/bench_init_zeros.jl
 create mode 100644 bench/microbench_loop_sum/bench_sum_loop_index.jl

diff --git a/Makefile b/Makefile
index f7f7c54..3ed0ded 100644
--- a/Makefile
+++ b/Makefile
@@ -24,6 +24,7 @@ build_ext_universal:
 
 build_ext:
 	$(PYTHON) setup.py build_ext -if
+	rm -f piconumpy/_piconumpy_hpy.py
 
 full:
 	$(PYTHON) -m pip install -e .[full]
diff --git a/bench/microbench_loop_sum/Makefile b/bench/microbench_loop_sum/Makefile
index e394385..c8f534d 100644
--- a/bench/microbench_loop_sum/Makefile
+++ b/bench/microbench_loop_sum/Makefile
@@ -1,5 +1,5 @@
 
-all: tmp_result_julia.txt
+bench: tmp_julia_sum_loop.txt
 	@python -c "import sys; from pprint import pprint as p; p({key: sys.implementation.__dict__[key] for key in ('cache_tag', 'version')})"
 	@python bench.py list
 	@python bench.py purepy
@@ -7,8 +7,41 @@ all: tmp_result_julia.txt
 	@python bench.py _piconumpy_hpy
 	@python bench.py _piconumpy_cpython_capi
 
-tmp_result_julia.txt: bench.jl
-	@julia bench.jl > tmp_result_julia.txt
+bench_sum_loop_index: tmp_julia_sum_loop_index.txt
+	@python -c "import sys; from pprint import pprint as p; p({key: sys.implementation.__dict__[key] for key in ('cache_tag', 'version')})"
+	@python bench.py list sum_loop_index
+	@python bench.py purepy sum_loop_index
+	@python bench.py numpy sum_loop_index
+	@python bench.py _piconumpy_hpy sum_loop_index
+	@python bench.py _piconumpy_cpython_capi sum_loop_index
+
+bench_cort: tmp_julia_cort.txt
+	@python -c "import sys; from pprint import pprint as p; p({key: sys.implementation.__dict__[key] for key in ('cache_tag', 'version')})"
+	@python bench.py list cort
+	@python bench.py purepy cort
+	@python bench.py numpy cort
+	@python bench.py _piconumpy_hpy cort
+	@python bench.py _piconumpy_cpython_capi cort
+
+bench_init_zeros: tmp_julia_init_zeros.txt
+	@python -c "import sys; from pprint import pprint as p; p({key: sys.implementation.__dict__[key] for key in ('cache_tag', 'version')})"
+	@python bench.py list init_zeros
+	@python bench.py purepy init_zeros
+	@python bench.py numpy init_zeros
+	@python bench.py _piconumpy_hpy init_zeros
+	@python bench.py _piconumpy_cpython_capi init_zeros
+
+tmp_julia_sum_loop.txt: bench.jl
+	@julia bench.jl > tmp_julia_sum_loop.txt
+
+tmp_julia_sum_loop_index.txt: bench_sum_loop_index.jl
+	@julia bench_sum_loop_index.jl > tmp_julia_sum_loop_index.txt
+
+tmp_julia_cort.txt: bench_cort.jl
+	@julia bench_cort.jl > tmp_julia_cort.txt
+
+tmp_julia_init_zeros.txt: bench_init_zeros.jl
+	@julia bench_init_zeros.jl > tmp_julia_init_zeros.txt
 
 clean:
 	rm -f tmp_*.txt
diff --git a/bench/microbench_loop_sum/bench.jl b/bench/microbench_loop_sum/bench.jl
index 440c755..5c38b52 100644
--- a/bench/microbench_loop_sum/bench.jl
+++ b/bench/microbench_loop_sum/bench.jl
@@ -8,6 +8,8 @@ function sum_loop(arr)
     return result
 end
 
+compute_from_arr = sum_loop
+
 size = 10000
 nb_runs = 200
 
@@ -15,7 +17,7 @@ times = zeros(nb_runs)
 
 for irun in 1:nb_runs
     arr = rand(size)
-    times[irun] = @elapsed sum_loop(arr)
+    times[irun] = @elapsed compute_from_arr(arr)
 end
 
 println(median(times))
diff --git a/bench/microbench_loop_sum/bench.py b/bench/microbench_loop_sum/bench.py
index 51c725d..321fe5e 100644
--- a/bench/microbench_loop_sum/bench.py
+++ b/bench/microbench_loop_sum/bench.py
@@ -2,23 +2,31 @@
 from time import perf_counter
 from pathlib import Path
 from random import random
+from math import sqrt
 
 import numpy as np
 
 on_pypy = sys.implementation.name == 'pypy'
 
-tmp_result_julia = Path("tmp_result_julia.txt")
-if tmp_result_julia.exists():
-    with open("tmp_result_julia.txt") as file:
-        norm = float(file.read())
-else:
-    print("tmp_result_julia.txt does not exist. First execute with `make`")
-
 try:
     method = sys.argv[1]
 except IndexError:
     method = "purepy"
 
+try:
+    name_bench = sys.argv[2]
+except IndexError:
+    name_bench = "sum_loop"
+
+
+tmp_result_julia = Path(f"tmp_julia_{name_bench}.txt")
+if tmp_result_julia.exists():
+    with open(tmp_result_julia) as file:
+        norm = float(file.read())
+else:
+    print(f"{tmp_result_julia} does not exist. First execute with `make`")
+
+
 
 def sum_loop(arr):
     result = 0.0
@@ -31,6 +39,38 @@ def sum_loop(arr):
     nb_runs = 5000
 
 
+def sum_loop_index(arr):
+    result = 0.0
+    for index in range(500):
+        result += arr[index]
+    return result
+
+
+def init_zeros(arr):
+    for index in range(len(arr)):
+        arr[index] = 0.0
+
+
+def _cort(s1, s2):
+    num = 0.0
+    sum_square_x = 0.0
+    sum_square_y = 0.0
+    for t in range(len(s1) - 1):
+        slope_1 = s1[t + 1] - s1[t]
+        slope_2 = s2[t + 1] - s2[t]
+        num += slope_1 * slope_2
+        sum_square_x += slope_1 * slope_1
+        sum_square_y += slope_2 * slope_2
+    return num / (sqrt(sum_square_x * sum_square_y))
+
+
+def cort(arr):
+    return _cort(arr, arr)
+
+
+compute_from_arr = locals()[name_bench]
+
+
 if method == "_piconumpy_hpy":
     from piconumpy.util_hpy import import_ext
 
@@ -50,12 +90,22 @@ def sum_loop(arr):
 # print(array)
 
 size = 10000
+
+# warming during ~ 1s
+data_as_list = [random() for _ in range(size)]
+arr = array(data_as_list)
+t_start = perf_counter()
+compute_from_arr(arr)
+t_first = perf_counter() - t_start
+for _ in range(round(1 / t_first)):
+    compute_from_arr(arr)
+
 times = []
 for _ in range(nb_runs):
     data_as_list = [random() for _ in range(size)]
     arr = array(data_as_list)
     t_start = perf_counter()
-    sum_loop(arr)
+    compute_from_arr(arr)
     times.append(perf_counter() - t_start)
 
 time = np.median(times)
diff --git a/bench/microbench_loop_sum/bench_cort.jl b/bench/microbench_loop_sum/bench_cort.jl
new file mode 100644
index 0000000..a816541
--- /dev/null
+++ b/bench/microbench_loop_sum/bench_cort.jl
@@ -0,0 +1,35 @@
+using Statistics
+
+
+function cort(s1, s2)
+    num = 0.0
+    sum_square_x = 0.0
+    sum_square_y = 0.0
+    for t in 1:length(s1)-1
+        slope_1 = s1[t + 1] - s1[t]
+        slope_2 = s2[t + 1] - s2[t]
+        num += slope_1 * slope_2
+        sum_square_x += slope_1 * slope_1
+        sum_square_y += slope_2 * slope_2
+    end
+    return num / (sqrt(sum_square_x * sum_square_y))
+end
+
+function use_cort(arr)
+    return cort(arr, arr)
+end
+
+
+compute_from_arr = use_cort
+
+size = 10000
+nb_runs = 200
+
+times = zeros(nb_runs)
+
+for irun in 1:nb_runs
+    arr = rand(size)
+    times[irun] = @elapsed compute_from_arr(arr)
+end
+
+println(median(times))
diff --git a/bench/microbench_loop_sum/bench_init_zeros.jl b/bench/microbench_loop_sum/bench_init_zeros.jl
new file mode 100644
index 0000000..b6035e5
--- /dev/null
+++ b/bench/microbench_loop_sum/bench_init_zeros.jl
@@ -0,0 +1,21 @@
+using Statistics
+
+function init_zeros(arr)
+    for i in eachindex(arr)
+        arr[i] = 0.0
+    end
+end
+
+compute_from_arr = init_zeros
+
+size = 10000
+nb_runs = 200
+
+times = zeros(nb_runs)
+
+for irun in 1:nb_runs
+    arr = rand(size)
+    times[irun] = @elapsed compute_from_arr(arr)
+end
+
+println(median(times))
diff --git a/bench/microbench_loop_sum/bench_sum_loop_index.jl b/bench/microbench_loop_sum/bench_sum_loop_index.jl
new file mode 100644
index 0000000..458e6c5
--- /dev/null
+++ b/bench/microbench_loop_sum/bench_sum_loop_index.jl
@@ -0,0 +1,23 @@
+using Statistics
+
+function sum_loop_index(arr)
+    result = 0.
+    for i = 1:500
+        result += arr[i]
+    end
+    return result
+end
+
+compute_from_arr = sum_loop_index
+
+size = 10000
+nb_runs = 200
+
+times = zeros(nb_runs)
+
+for irun in 1:nb_runs
+    arr = rand(size)
+    times[irun] = @elapsed compute_from_arr(arr)
+end
+
+println(median(times))

From b0143143213ba9967b78b547fe4ffef5216c495e Mon Sep 17 00:00:00 2001
From: paugier <pierre.augier@univ-grenoble-alpes.fr>
Date: Wed, 8 Dec 2021 17:21:15 +0100
Subject: [PATCH 13/32] New dir microbench_low_level

---
 bench/{microbench_loop_sum => microbench_low_level}/Makefile      | 0
 bench/{microbench_loop_sum => microbench_low_level}/README.md     | 0
 bench/{microbench_loop_sum => microbench_low_level}/bench.jl      | 0
 bench/{microbench_loop_sum => microbench_low_level}/bench.py      | 0
 bench/{microbench_loop_sum => microbench_low_level}/bench_cort.jl | 0
 .../bench_init_zeros.jl                                           | 0
 .../bench_sum_loop_index.jl                                       | 0
 7 files changed, 0 insertions(+), 0 deletions(-)
 rename bench/{microbench_loop_sum => microbench_low_level}/Makefile (100%)
 rename bench/{microbench_loop_sum => microbench_low_level}/README.md (100%)
 rename bench/{microbench_loop_sum => microbench_low_level}/bench.jl (100%)
 rename bench/{microbench_loop_sum => microbench_low_level}/bench.py (100%)
 rename bench/{microbench_loop_sum => microbench_low_level}/bench_cort.jl (100%)
 rename bench/{microbench_loop_sum => microbench_low_level}/bench_init_zeros.jl (100%)
 rename bench/{microbench_loop_sum => microbench_low_level}/bench_sum_loop_index.jl (100%)

diff --git a/bench/microbench_loop_sum/Makefile b/bench/microbench_low_level/Makefile
similarity index 100%
rename from bench/microbench_loop_sum/Makefile
rename to bench/microbench_low_level/Makefile
diff --git a/bench/microbench_loop_sum/README.md b/bench/microbench_low_level/README.md
similarity index 100%
rename from bench/microbench_loop_sum/README.md
rename to bench/microbench_low_level/README.md
diff --git a/bench/microbench_loop_sum/bench.jl b/bench/microbench_low_level/bench.jl
similarity index 100%
rename from bench/microbench_loop_sum/bench.jl
rename to bench/microbench_low_level/bench.jl
diff --git a/bench/microbench_loop_sum/bench.py b/bench/microbench_low_level/bench.py
similarity index 100%
rename from bench/microbench_loop_sum/bench.py
rename to bench/microbench_low_level/bench.py
diff --git a/bench/microbench_loop_sum/bench_cort.jl b/bench/microbench_low_level/bench_cort.jl
similarity index 100%
rename from bench/microbench_loop_sum/bench_cort.jl
rename to bench/microbench_low_level/bench_cort.jl
diff --git a/bench/microbench_loop_sum/bench_init_zeros.jl b/bench/microbench_low_level/bench_init_zeros.jl
similarity index 100%
rename from bench/microbench_loop_sum/bench_init_zeros.jl
rename to bench/microbench_low_level/bench_init_zeros.jl
diff --git a/bench/microbench_loop_sum/bench_sum_loop_index.jl b/bench/microbench_low_level/bench_sum_loop_index.jl
similarity index 100%
rename from bench/microbench_loop_sum/bench_sum_loop_index.jl
rename to bench/microbench_low_level/bench_sum_loop_index.jl

From 01effe959c449ee673f79c202a5be65098e8e5d1 Mon Sep 17 00:00:00 2001
From: paugier <pierre.augier@univ-grenoble-alpes.fr>
Date: Wed, 8 Dec 2021 22:02:44 +0100
Subject: [PATCH 14/32] Improve output benchmarks

---
 bench/microbench_low_level/Makefile           | 15 +++++--
 bench/microbench_low_level/README.md          | 11 ++++-
 bench/microbench_low_level/bench.py           |  9 +---
 .../bench_sum_loop_index.jl                   |  2 +-
 bench/microbench_low_level/result_sum_loop.md | 41 +++++++++++++++++++
 5 files changed, 63 insertions(+), 15 deletions(-)
 create mode 100644 bench/microbench_low_level/result_sum_loop.md

diff --git a/bench/microbench_low_level/Makefile b/bench/microbench_low_level/Makefile
index c8f534d..0dba15a 100644
--- a/bench/microbench_low_level/Makefile
+++ b/bench/microbench_low_level/Makefile
@@ -1,6 +1,8 @@
 
+.PHONY : clean print_info bench bench_sum_loop_index bench_cort bench_init_zeros
+
 bench: tmp_julia_sum_loop.txt
-	@python -c "import sys; from pprint import pprint as p; p({key: sys.implementation.__dict__[key] for key in ('cache_tag', 'version')})"
+	@$(MAKE) --no-print-directory print_info NAME_BENCH=sum_loop
 	@python bench.py list
 	@python bench.py purepy
 	@python bench.py numpy
@@ -8,7 +10,7 @@ bench: tmp_julia_sum_loop.txt
 	@python bench.py _piconumpy_cpython_capi
 
 bench_sum_loop_index: tmp_julia_sum_loop_index.txt
-	@python -c "import sys; from pprint import pprint as p; p({key: sys.implementation.__dict__[key] for key in ('cache_tag', 'version')})"
+	@$(MAKE) --no-print-directory print_info NAME_BENCH=sum_loop_index
 	@python bench.py list sum_loop_index
 	@python bench.py purepy sum_loop_index
 	@python bench.py numpy sum_loop_index
@@ -16,7 +18,7 @@ bench_sum_loop_index: tmp_julia_sum_loop_index.txt
 	@python bench.py _piconumpy_cpython_capi sum_loop_index
 
 bench_cort: tmp_julia_cort.txt
-	@python -c "import sys; from pprint import pprint as p; p({key: sys.implementation.__dict__[key] for key in ('cache_tag', 'version')})"
+	@$(MAKE) --no-print-directory print_info NAME_BENCH=cort
 	@python bench.py list cort
 	@python bench.py purepy cort
 	@python bench.py numpy cort
@@ -24,13 +26,18 @@ bench_cort: tmp_julia_cort.txt
 	@python bench.py _piconumpy_cpython_capi cort
 
 bench_init_zeros: tmp_julia_init_zeros.txt
-	@python -c "import sys; from pprint import pprint as p; p({key: sys.implementation.__dict__[key] for key in ('cache_tag', 'version')})"
+	@$(MAKE) --no-print-directory print_info NAME_BENCH=init_zeros
 	@python bench.py list init_zeros
 	@python bench.py purepy init_zeros
 	@python bench.py numpy init_zeros
 	@python bench.py _piconumpy_hpy init_zeros
 	@python bench.py _piconumpy_cpython_capi init_zeros
 
+print_info:
+	@echo bench $(NAME_BENCH)
+	@python -c "from socket import gethostname as f; print('hostname:', f())"
+	@python -c "import sys; from pprint import pprint as p; p({key: sys.implementation.__dict__[key] for key in ('cache_tag', 'version')})"
+
 tmp_julia_sum_loop.txt: bench.jl
 	@julia bench.jl > tmp_julia_sum_loop.txt
 
diff --git a/bench/microbench_low_level/README.md b/bench/microbench_low_level/README.md
index dade95a..902b1dd 100644
--- a/bench/microbench_low_level/README.md
+++ b/bench/microbench_low_level/README.md
@@ -1,6 +1,13 @@
-# Microbenchmark sum_loop
+# Microbenchmarks low level Python code
 
-We measure the performance for this function:
+We measure the performance for functions containing low level Python code.
+
+- `sum_loop`: `for value in arr` and summation
+- `sum_loop_sum_loop_index`: `for index in range(5000)`
+- `init_zeros`: set values to zeros
+- `cort`: normalized cosine similarity measure between derivatives
+
+We measure the performance for functions containing low level Python code.
 
 ```python
 def sum_loop(arr):
diff --git a/bench/microbench_low_level/bench.py b/bench/microbench_low_level/bench.py
index 321fe5e..5813d6b 100644
--- a/bench/microbench_low_level/bench.py
+++ b/bench/microbench_low_level/bench.py
@@ -6,8 +6,6 @@
 
 import numpy as np
 
-on_pypy = sys.implementation.name == 'pypy'
-
 try:
     method = sys.argv[1]
 except IndexError:
@@ -27,21 +25,16 @@
     print(f"{tmp_result_julia} does not exist. First execute with `make`")
 
 
-
 def sum_loop(arr):
     result = 0.0
     for value in arr:
         result += value
     return result
 
-nb_runs = 500
-if on_pypy and method in ["list", "_piconumpy_hpy", "purepy"]:
-    nb_runs = 5000
-
 
 def sum_loop_index(arr):
     result = 0.0
-    for index in range(500):
+    for index in range(5000):
         result += arr[index]
     return result
 
diff --git a/bench/microbench_low_level/bench_sum_loop_index.jl b/bench/microbench_low_level/bench_sum_loop_index.jl
index 458e6c5..b4c682c 100644
--- a/bench/microbench_low_level/bench_sum_loop_index.jl
+++ b/bench/microbench_low_level/bench_sum_loop_index.jl
@@ -2,7 +2,7 @@ using Statistics
 
 function sum_loop_index(arr)
     result = 0.
-    for i = 1:500
+    for i = 1:5000
         result += arr[i]
     end
     return result
diff --git a/bench/microbench_low_level/result_sum_loop.md b/bench/microbench_low_level/result_sum_loop.md
new file mode 100644
index 0000000..fd649cb
--- /dev/null
+++ b/bench/microbench_low_level/result_sum_loop.md
@@ -0,0 +1,41 @@
+# Microbenchmark sum_loop
+
+We measure the performance for this function:
+
+```python
+def sum_loop(arr):
+    result = 0.0
+    for value in arr:
+        result += value
+    return result
+```
+
+One can run the benchmarks with `make`.
+
+With PyPy3.7, I get:
+
+```
+bench sum_loop
+hostname: voyage
+{'cache_tag': 'pypy37',
+ 'version': sys.pypy_version_info(major=7, minor=3, micro=7, releaselevel='final', serial=0)}
+list                          : 2.34e-05 s (  1.7 * Julia)
+purepy                        : 2.41e-05 s (  1.8 * Julia)
+numpy                         : 8.91e-03 s (654.4 * Julia)
+_piconumpy_hpy                : 3.37e-04 s ( 24.8 * Julia)
+_piconumpy_cpython_capi       : 2.04e-03 s (150.1 * Julia)
+```
+
+With CPython:
+
+```
+bench sum_loop
+hostname: voyage
+{'cache_tag': 'cpython-39',
+ 'version': sys.version_info(major=3, minor=9, micro=7, releaselevel='final', serial=0)}
+list                          : 3.59e-04 s ( 26.4 * Julia)
+purepy                        : 2.18e-03 s (160.3 * Julia)
+numpy                         : 1.12e-03 s ( 82.5 * Julia)
+_piconumpy_hpy                : 6.30e-04 s ( 46.2 * Julia)
+_piconumpy_cpython_capi       : 5.04e-04 s ( 37.0 * Julia)
+```

From b653dfd3b4f5257ccb2ea807dd23faa6afd0d4c9 Mon Sep 17 00:00:00 2001
From: paugier <pierre.augier@univ-grenoble-alpes.fr>
Date: Thu, 9 Dec 2021 10:34:47 +0100
Subject: [PATCH 15/32] Clean up Makefile + result files

---
 bench/microbench_low_level/Makefile           | 61 +++++++------------
 bench/microbench_low_level/README.md          | 15 +++--
 .../{bench.jl => bench_sum_loop.jl}           |  0
 bench/microbench_low_level/result_cort.md     | 50 +++++++++++++++
 .../microbench_low_level/result_init_zeros.md | 39 ++++++++++++
 bench/microbench_low_level/result_sum_loop.md |  2 +-
 .../result_sum_loop_index.md                  | 41 +++++++++++++
 7 files changed, 165 insertions(+), 43 deletions(-)
 rename bench/microbench_low_level/{bench.jl => bench_sum_loop.jl} (100%)
 create mode 100644 bench/microbench_low_level/result_cort.md
 create mode 100644 bench/microbench_low_level/result_init_zeros.md
 create mode 100644 bench/microbench_low_level/result_sum_loop_index.md

diff --git a/bench/microbench_low_level/Makefile b/bench/microbench_low_level/Makefile
index 0dba15a..e9f350e 100644
--- a/bench/microbench_low_level/Makefile
+++ b/bench/microbench_low_level/Makefile
@@ -1,45 +1,30 @@
 
-.PHONY : clean print_info bench bench_sum_loop_index bench_cort bench_init_zeros
-
-bench: tmp_julia_sum_loop.txt
-	@$(MAKE) --no-print-directory print_info NAME_BENCH=sum_loop
-	@python bench.py list
-	@python bench.py purepy
-	@python bench.py numpy
-	@python bench.py _piconumpy_hpy
-	@python bench.py _piconumpy_cpython_capi
-
-bench_sum_loop_index: tmp_julia_sum_loop_index.txt
-	@$(MAKE) --no-print-directory print_info NAME_BENCH=sum_loop_index
-	@python bench.py list sum_loop_index
-	@python bench.py purepy sum_loop_index
-	@python bench.py numpy sum_loop_index
-	@python bench.py _piconumpy_hpy sum_loop_index
-	@python bench.py _piconumpy_cpython_capi sum_loop_index
-
-bench_cort: tmp_julia_cort.txt
-	@$(MAKE) --no-print-directory print_info NAME_BENCH=cort
-	@python bench.py list cort
-	@python bench.py purepy cort
-	@python bench.py numpy cort
-	@python bench.py _piconumpy_hpy cort
-	@python bench.py _piconumpy_cpython_capi cort
-
-bench_init_zeros: tmp_julia_init_zeros.txt
-	@$(MAKE) --no-print-directory print_info NAME_BENCH=init_zeros
-	@python bench.py list init_zeros
-	@python bench.py purepy init_zeros
-	@python bench.py numpy init_zeros
-	@python bench.py _piconumpy_hpy init_zeros
-	@python bench.py _piconumpy_cpython_capi init_zeros
-
-print_info:
+.PHONY : clean bench_sum_loop bench_sum_loop_index bench_cort bench_init_zeros
+
+bench_sum_loop: NAME_BENCH=sum_loop
+bench_sum_loop: tmp_julia_sum_loop.txt _bench
+
+bench_sum_loop_index: NAME_BENCH=sum_loop_index
+bench_sum_loop_index: tmp_julia_sum_loop_index.txt _bench
+
+bench_cort: NAME_BENCH=cort
+bench_cort: tmp_julia_cort.txt _bench
+
+bench_init_zeros: NAME_BENCH=init_zeros
+bench_init_zeros: tmp_julia_init_zeros.txt _bench
+
+_bench:
 	@echo bench $(NAME_BENCH)
 	@python -c "from socket import gethostname as f; print('hostname:', f())"
 	@python -c "import sys; from pprint import pprint as p; p({key: sys.implementation.__dict__[key] for key in ('cache_tag', 'version')})"
-
-tmp_julia_sum_loop.txt: bench.jl
-	@julia bench.jl > tmp_julia_sum_loop.txt
+	@python bench.py list $(NAME_BENCH)
+	@python bench.py purepy $(NAME_BENCH)
+	@python bench.py numpy $(NAME_BENCH)
+	@python bench.py _piconumpy_hpy $(NAME_BENCH)
+	@python bench.py _piconumpy_cpython_capi $(NAME_BENCH)
+
+tmp_julia_sum_loop.txt: bench_sum_loop.jl
+	@julia bench_sum_loop.jl > tmp_julia_sum_loop.txt
 
 tmp_julia_sum_loop_index.txt: bench_sum_loop_index.jl
 	@julia bench_sum_loop_index.jl > tmp_julia_sum_loop_index.txt
diff --git a/bench/microbench_low_level/README.md b/bench/microbench_low_level/README.md
index 902b1dd..0f1e32f 100644
--- a/bench/microbench_low_level/README.md
+++ b/bench/microbench_low_level/README.md
@@ -2,10 +2,17 @@
 
 We measure the performance for functions containing low level Python code.
 
-- `sum_loop`: `for value in arr` and summation
-- `sum_loop_sum_loop_index`: `for index in range(5000)`
-- `init_zeros`: set values to zeros
-- `cort`: normalized cosine similarity measure between derivatives
+- `sum_loop` (command `make bench_sum_loop`): `for value in arr` and summation
+
+- `sum_loop_index` (command `make bench_sum_loop_index`):
+  `for index in range(5000)` and summation
+
+- `init_zeros` (command `make bench_init_zeros`): set values to zeros
+
+- `cort` (command `make bench_cort`): normalized cosine similarity measure
+  between derivatives
+
+The files result_*.md contain a few results.
 
 We measure the performance for functions containing low level Python code.
 
diff --git a/bench/microbench_low_level/bench.jl b/bench/microbench_low_level/bench_sum_loop.jl
similarity index 100%
rename from bench/microbench_low_level/bench.jl
rename to bench/microbench_low_level/bench_sum_loop.jl
diff --git a/bench/microbench_low_level/result_cort.md b/bench/microbench_low_level/result_cort.md
new file mode 100644
index 0000000..a1b5a22
--- /dev/null
+++ b/bench/microbench_low_level/result_cort.md
@@ -0,0 +1,50 @@
+# Microbenchmark cort
+
+We measure the performance for this function:
+
+```python
+def cort(arr):
+    return _cort(arr, arr)
+
+def _cort(s1, s2):
+    num = 0.0
+    sum_square_x = 0.0
+    sum_square_y = 0.0
+    for t in range(len(s1) - 1):
+        slope_1 = s1[t + 1] - s1[t]
+        slope_2 = s2[t + 1] - s2[t]
+        num += slope_1 * slope_2
+        sum_square_x += slope_1 * slope_1
+        sum_square_y += slope_2 * slope_2
+    return num / (sqrt(sum_square_x * sum_square_y))
+```
+
+One can run the benchmarks with `make bench_cort`.
+
+With PyPy3.7, I get:
+
+```
+bench cort
+hostname: voyage
+{'cache_tag': 'pypy37',
+ 'version': sys.pypy_version_info(major=7, minor=3, micro=7, releaselevel='final', serial=0)}
+list                          : 4.29e-05 s (  1.8 * Julia)
+purepy                        : 3.89e-05 s (  1.6 * Julia)
+numpy                         : 4.72e-02 s (1957.9 * Julia)
+_piconumpy_hpy                : 1.36e-03 s ( 56.5 * Julia)
+_piconumpy_cpython_capi       : 8.03e-03 s (332.8 * Julia)
+```
+
+With CPython:
+
+```
+bench cort
+hostname: voyage
+{'cache_tag': 'cpython-39',
+ 'version': sys.version_info(major=3, minor=9, micro=7, releaselevel='final', serial=0)}
+list                          : 4.47e-03 s (185.4 * Julia)
+purepy                        : 1.08e-02 s (448.6 * Julia)
+numpy                         : 9.69e-03 s (401.6 * Julia)
+_piconumpy_hpy                : 5.32e-03 s (220.5 * Julia)
+_piconumpy_cpython_capi       : 4.80e-03 s (198.9 * Julia)
+```
diff --git a/bench/microbench_low_level/result_init_zeros.md b/bench/microbench_low_level/result_init_zeros.md
new file mode 100644
index 0000000..1228d63
--- /dev/null
+++ b/bench/microbench_low_level/result_init_zeros.md
@@ -0,0 +1,39 @@
+# Microbenchmark init_zeros
+
+We measure the performance for this function:
+
+```python
+def init_zeros(arr):
+    for index in range(len(arr)):
+        arr[index] = 0.0
+```
+
+One can run the benchmarks with `make bench_init_zeros`.
+
+With PyPy3.7, I get:
+
+```
+bench init_zeros
+hostname: voyage
+{'cache_tag': 'pypy37',
+ 'version': sys.pypy_version_info(major=7, minor=3, micro=7, releaselevel='final', serial=0)}
+list                          : 2.53e-05 s (  5.2 * Julia)
+purepy                        : 2.65e-05 s (  5.4 * Julia)
+numpy                         : 9.01e-03 s (1848.7 * Julia)
+_piconumpy_hpy                : 4.17e-04 s ( 85.6 * Julia)
+_piconumpy_cpython_capi       : 1.10e-03 s (224.9 * Julia)
+```
+
+With CPython:
+
+```
+bench init_zeros
+hostname: voyage
+{'cache_tag': 'cpython-39',
+ 'version': sys.version_info(major=3, minor=9, micro=7, releaselevel='final', serial=0)}
+list                          : 5.18e-04 s (106.3 * Julia)
+purepy                        : 1.94e-03 s (397.8 * Julia)
+numpy                         : 1.17e-03 s (239.3 * Julia)
+_piconumpy_hpy                : 6.46e-04 s (132.5 * Julia)
+_piconumpy_cpython_capi       : 5.43e-04 s (111.4 * Julia)
+```
diff --git a/bench/microbench_low_level/result_sum_loop.md b/bench/microbench_low_level/result_sum_loop.md
index fd649cb..556506c 100644
--- a/bench/microbench_low_level/result_sum_loop.md
+++ b/bench/microbench_low_level/result_sum_loop.md
@@ -10,7 +10,7 @@ def sum_loop(arr):
     return result
 ```
 
-One can run the benchmarks with `make`.
+One can run the benchmarks with `make bench_sum_loop`.
 
 With PyPy3.7, I get:
 
diff --git a/bench/microbench_low_level/result_sum_loop_index.md b/bench/microbench_low_level/result_sum_loop_index.md
new file mode 100644
index 0000000..65a11c2
--- /dev/null
+++ b/bench/microbench_low_level/result_sum_loop_index.md
@@ -0,0 +1,41 @@
+# Microbenchmark sum_loop_index
+
+We measure the performance for this function:
+
+```python
+def sum_loop_index(arr):
+    result = 0.0
+    for index in range(5000):
+        result += arr[index]
+    return result
+```
+
+One can run the benchmarks with `make bench_sum_loop_index`.
+
+With PyPy3.7, I get:
+
+```
+bench sum_loop_index
+hostname: voyage
+{'cache_tag': 'pypy37',
+ 'version': sys.pypy_version_info(major=7, minor=3, micro=7, releaselevel='final', serial=0)}
+list                          : 1.19e-05 s (  2.0 * Julia)
+purepy                        : 1.20e-05 s (  2.1 * Julia)
+numpy                         : 4.07e-03 s (692.9 * Julia)
+_piconumpy_hpy                : 1.65e-04 s ( 28.2 * Julia)
+_piconumpy_cpython_capi       : 9.95e-04 s (169.5 * Julia)
+```
+
+With CPython:
+
+```
+bench sum_loop_index
+hostname: voyage
+{'cache_tag': 'cpython-39',
+ 'version': sys.version_info(major=3, minor=9, micro=7, releaselevel='final', serial=0)}
+list                          : 3.91e-04 s ( 66.6 * Julia)
+purepy                        : 1.10e-03 s (186.9 * Julia)
+numpy                         : 8.95e-04 s (152.5 * Julia)
+_piconumpy_hpy                : 4.85e-04 s ( 82.5 * Julia)
+_piconumpy_cpython_capi       : 4.15e-04 s ( 70.6 * Julia)
+```

From 655e4925869d7027d3dc822404592b27f53682df Mon Sep 17 00:00:00 2001
From: paugier <pierre.augier@univ-grenoble-alpes.fr>
Date: Thu, 9 Dec 2021 14:49:36 +0100
Subject: [PATCH 16/32] Update results with GraalPython

---
 bench/microbench_low_level/bench.py           | 26 +++++++++-----
 bench/microbench_low_level/result_cort.md     | 32 ++++++++++++-----
 .../microbench_low_level/result_init_zeros.md | 32 ++++++++++++-----
 bench/microbench_low_level/result_sum_loop.md | 35 +++++++++++++------
 .../result_sum_loop_index.md                  | 32 ++++++++++++-----
 5 files changed, 111 insertions(+), 46 deletions(-)

diff --git a/bench/microbench_low_level/bench.py b/bench/microbench_low_level/bench.py
index 5813d6b..8524fd5 100644
--- a/bench/microbench_low_level/bench.py
+++ b/bench/microbench_low_level/bench.py
@@ -4,8 +4,6 @@
 from random import random
 from math import sqrt
 
-import numpy as np
-
 try:
     method = sys.argv[1]
 except IndexError:
@@ -72,6 +70,13 @@ def cort(arr):
 elif method == "list":
     array = list
 elif method == "numpy":
+
+    try:
+        import numpy as np
+    except ImportError:
+        print(f"{method:30s}: ImportError numpy")
+        sys.exit(0)
+
     array = np.array
 else:
     d = {}
@@ -80,26 +85,29 @@ def cort(arr):
     if "piconumpy" not in method:
         method = f"piconumpy.{method}"
 
-# print(array)
-
 size = 10000
 
 # warming during ~ 1s
 data_as_list = [random() for _ in range(size)]
 arr = array(data_as_list)
 t_start = perf_counter()
-compute_from_arr(arr)
-t_first = perf_counter() - t_start
-for _ in range(round(1 / t_first)):
+while perf_counter() - t_start < 1.0:
     compute_from_arr(arr)
 
+
+def median(sequence):
+    tmp = sorted(sequence)
+    return tmp[len(tmp) // 2]
+
+
+t0 = perf_counter()
 times = []
-for _ in range(nb_runs):
+while perf_counter() - t0 < 2.0:
     data_as_list = [random() for _ in range(size)]
     arr = array(data_as_list)
     t_start = perf_counter()
     compute_from_arr(arr)
     times.append(perf_counter() - t_start)
 
-time = np.median(times)
+time = median(times)
 print(f"{method:30s}: {time:.2e} s ({time / norm:5.1f} * Julia)")
diff --git a/bench/microbench_low_level/result_cort.md b/bench/microbench_low_level/result_cort.md
index a1b5a22..4b8ff6c 100644
--- a/bench/microbench_low_level/result_cort.md
+++ b/bench/microbench_low_level/result_cort.md
@@ -29,10 +29,10 @@ hostname: voyage
 {'cache_tag': 'pypy37',
  'version': sys.pypy_version_info(major=7, minor=3, micro=7, releaselevel='final', serial=0)}
 list                          : 4.29e-05 s (  1.8 * Julia)
-purepy                        : 3.89e-05 s (  1.6 * Julia)
-numpy                         : 4.72e-02 s (1957.9 * Julia)
-_piconumpy_hpy                : 1.36e-03 s ( 56.5 * Julia)
-_piconumpy_cpython_capi       : 8.03e-03 s (332.8 * Julia)
+purepy                        : 4.12e-05 s (  1.7 * Julia)
+numpy                         : 4.77e-02 s (1975.5 * Julia)
+_piconumpy_hpy                : 1.46e-03 s ( 60.5 * Julia)
+_piconumpy_cpython_capi       : 6.96e-03 s (288.5 * Julia)
 ```
 
 With CPython:
@@ -42,9 +42,23 @@ bench cort
 hostname: voyage
 {'cache_tag': 'cpython-39',
  'version': sys.version_info(major=3, minor=9, micro=7, releaselevel='final', serial=0)}
-list                          : 4.47e-03 s (185.4 * Julia)
-purepy                        : 1.08e-02 s (448.6 * Julia)
-numpy                         : 9.69e-03 s (401.6 * Julia)
-_piconumpy_hpy                : 5.32e-03 s (220.5 * Julia)
-_piconumpy_cpython_capi       : 4.80e-03 s (198.9 * Julia)
+list                          : 4.42e-03 s (183.4 * Julia)
+purepy                        : 1.04e-02 s (430.0 * Julia)
+numpy                         : 9.76e-03 s (404.4 * Julia)
+_piconumpy_hpy                : 5.66e-03 s (234.7 * Julia)
+_piconumpy_cpython_capi       : 4.77e-03 s (197.7 * Julia)
+```
+
+With Python 3.8.5 (GraalVM CE Native 21.3.0)
+
+```
+bench cort
+hostname: voyage
+{'cache_tag': 'graalpython-38',
+ 'version': sys.version_info(major=3, minor=8, micro=5, releaselevel='alpha', serial=0)}
+list                          : 2.44e-05 s (  1.0 * Julia)
+purepy                        : 3.13e-05 s (  1.3 * Julia)
+numpy                         : ImportError numpy
+_piconumpy_hpy                : 1.69e-04 s (  7.0 * Julia)
+_piconumpy_cpython_capi       : 3.55e-04 s ( 14.7 * Julia)
 ```
diff --git a/bench/microbench_low_level/result_init_zeros.md b/bench/microbench_low_level/result_init_zeros.md
index 1228d63..68eee34 100644
--- a/bench/microbench_low_level/result_init_zeros.md
+++ b/bench/microbench_low_level/result_init_zeros.md
@@ -17,11 +17,11 @@ bench init_zeros
 hostname: voyage
 {'cache_tag': 'pypy37',
  'version': sys.pypy_version_info(major=7, minor=3, micro=7, releaselevel='final', serial=0)}
-list                          : 2.53e-05 s (  5.2 * Julia)
-purepy                        : 2.65e-05 s (  5.4 * Julia)
-numpy                         : 9.01e-03 s (1848.7 * Julia)
-_piconumpy_hpy                : 4.17e-04 s ( 85.6 * Julia)
-_piconumpy_cpython_capi       : 1.10e-03 s (224.9 * Julia)
+list                          : 2.63e-05 s (  5.4 * Julia)
+purepy                        : 2.99e-05 s (  6.1 * Julia)
+numpy                         : 1.17e-02 s (2403.5 * Julia)
+_piconumpy_hpy                : 4.58e-04 s ( 94.1 * Julia)
+_piconumpy_cpython_capi       : 8.46e-04 s (173.6 * Julia)
 ```
 
 With CPython:
@@ -31,9 +31,23 @@ bench init_zeros
 hostname: voyage
 {'cache_tag': 'cpython-39',
  'version': sys.version_info(major=3, minor=9, micro=7, releaselevel='final', serial=0)}
-list                          : 5.18e-04 s (106.3 * Julia)
-purepy                        : 1.94e-03 s (397.8 * Julia)
+list                          : 5.34e-04 s (109.6 * Julia)
+purepy                        : 2.03e-03 s (417.4 * Julia)
 numpy                         : 1.17e-03 s (239.3 * Julia)
-_piconumpy_hpy                : 6.46e-04 s (132.5 * Julia)
-_piconumpy_cpython_capi       : 5.43e-04 s (111.4 * Julia)
+_piconumpy_hpy                : 7.51e-04 s (154.1 * Julia)
+_piconumpy_cpython_capi       : 5.44e-04 s (111.5 * Julia)
+```
+
+With Python 3.8.5 (GraalVM CE Native 21.3.0)
+
+```
+bench init_zeros
+hostname: voyage
+{'cache_tag': 'graalpython-38',
+ 'version': sys.version_info(major=3, minor=8, micro=5, releaselevel='alpha', serial=0)}
+list                          : 1.37e-05 s (  2.8 * Julia)
+purepy                        : 1.93e-05 s (  4.0 * Julia)
+numpy                         : ImportError numpy
+_piconumpy_hpy                : 4.68e-05 s (  9.6 * Julia)
+_piconumpy_cpython_capi       : 1.74e-04 s ( 35.8 * Julia)
 ```
diff --git a/bench/microbench_low_level/result_sum_loop.md b/bench/microbench_low_level/result_sum_loop.md
index 556506c..f2d0a86 100644
--- a/bench/microbench_low_level/result_sum_loop.md
+++ b/bench/microbench_low_level/result_sum_loop.md
@@ -19,11 +19,11 @@ bench sum_loop
 hostname: voyage
 {'cache_tag': 'pypy37',
  'version': sys.pypy_version_info(major=7, minor=3, micro=7, releaselevel='final', serial=0)}
-list                          : 2.34e-05 s (  1.7 * Julia)
-purepy                        : 2.41e-05 s (  1.8 * Julia)
-numpy                         : 8.91e-03 s (654.4 * Julia)
-_piconumpy_hpy                : 3.37e-04 s ( 24.8 * Julia)
-_piconumpy_cpython_capi       : 2.04e-03 s (150.1 * Julia)
+list                          : 2.35e-05 s (  1.8 * Julia)
+purepy                        : 2.60e-05 s (  2.0 * Julia)
+numpy                         : 8.97e-03 s (677.0 * Julia)
+_piconumpy_hpy                : 3.73e-04 s ( 28.2 * Julia)
+_piconumpy_cpython_capi       : 1.75e-03 s (132.4 * Julia)
 ```
 
 With CPython:
@@ -33,9 +33,24 @@ bench sum_loop
 hostname: voyage
 {'cache_tag': 'cpython-39',
  'version': sys.version_info(major=3, minor=9, micro=7, releaselevel='final', serial=0)}
-list                          : 3.59e-04 s ( 26.4 * Julia)
-purepy                        : 2.18e-03 s (160.3 * Julia)
-numpy                         : 1.12e-03 s ( 82.5 * Julia)
-_piconumpy_hpy                : 6.30e-04 s ( 46.2 * Julia)
-_piconumpy_cpython_capi       : 5.04e-04 s ( 37.0 * Julia)
+list                          : 3.65e-04 s ( 27.5 * Julia)
+purepy                        : 2.17e-03 s (164.1 * Julia)
+numpy                         : 1.09e-03 s ( 82.2 * Julia)
+_piconumpy_hpy                : 7.39e-04 s ( 55.8 * Julia)
+_piconumpy_cpython_capi       : 5.07e-04 s ( 38.3 * Julia)
+
+```
+
+With Python 3.8.5 (GraalVM CE Native 21.3.0)
+
+```
+bench sum_loop
+hostname: voyage
+{'cache_tag': 'graalpython-38',
+ 'version': sys.version_info(major=3, minor=8, micro=5, releaselevel='alpha', serial=0)}
+list                          : 1.92e-05 s (  1.4 * Julia)
+purepy                        : 3.61e-05 s (  2.7 * Julia)
+numpy                         : ImportError numpy
+_piconumpy_hpy                : 5.03e-04 s ( 38.0 * Julia)
+_piconumpy_cpython_capi       : 2.90e-03 s (219.1 * Julia)
 ```
diff --git a/bench/microbench_low_level/result_sum_loop_index.md b/bench/microbench_low_level/result_sum_loop_index.md
index 65a11c2..4a56d2b 100644
--- a/bench/microbench_low_level/result_sum_loop_index.md
+++ b/bench/microbench_low_level/result_sum_loop_index.md
@@ -20,10 +20,10 @@ hostname: voyage
 {'cache_tag': 'pypy37',
  'version': sys.pypy_version_info(major=7, minor=3, micro=7, releaselevel='final', serial=0)}
 list                          : 1.19e-05 s (  2.0 * Julia)
-purepy                        : 1.20e-05 s (  2.1 * Julia)
-numpy                         : 4.07e-03 s (692.9 * Julia)
-_piconumpy_hpy                : 1.65e-04 s ( 28.2 * Julia)
-_piconumpy_cpython_capi       : 9.95e-04 s (169.5 * Julia)
+purepy                        : 1.64e-05 s (  2.8 * Julia)
+numpy                         : 4.18e-03 s (711.4 * Julia)
+_piconumpy_hpy                : 1.73e-04 s ( 29.4 * Julia)
+_piconumpy_cpython_capi       : 8.44e-04 s (143.8 * Julia)
 ```
 
 With CPython:
@@ -33,9 +33,23 @@ bench sum_loop_index
 hostname: voyage
 {'cache_tag': 'cpython-39',
  'version': sys.version_info(major=3, minor=9, micro=7, releaselevel='final', serial=0)}
-list                          : 3.91e-04 s ( 66.6 * Julia)
-purepy                        : 1.10e-03 s (186.9 * Julia)
-numpy                         : 8.95e-04 s (152.5 * Julia)
-_piconumpy_hpy                : 4.85e-04 s ( 82.5 * Julia)
-_piconumpy_cpython_capi       : 4.15e-04 s ( 70.6 * Julia)
+list                          : 3.91e-04 s ( 66.5 * Julia)
+purepy                        : 1.11e-03 s (188.3 * Julia)
+numpy                         : 8.93e-04 s (152.1 * Julia)
+_piconumpy_hpy                : 5.42e-04 s ( 92.3 * Julia)
+_piconumpy_cpython_capi       : 4.17e-04 s ( 71.0 * Julia)
+```
+
+With Python 3.8.5 (GraalVM CE Native 21.3.0)
+
+```
+bench sum_loop_index
+hostname: voyage
+{'cache_tag': 'graalpython-38',
+ 'version': sys.version_info(major=3, minor=8, micro=5, releaselevel='alpha', serial=0)}
+list                          : 1.36e-05 s (  2.3 * Julia)
+purepy                        : 1.81e-05 s (  3.1 * Julia)
+numpy                         : ImportError numpy
+_piconumpy_hpy                : 3.68e-05 s (  6.3 * Julia)
+_piconumpy_cpython_capi       : 1.08e-04 s ( 18.5 * Julia)
 ```

From 2a8cbf2160f3ac0b93eb9648dc7751f8a29dd5b6 Mon Sep 17 00:00:00 2001
From: paugier <pierre.augier@univ-grenoble-alpes.fr>
Date: Thu, 9 Dec 2021 16:58:51 +0100
Subject: [PATCH 17/32] Cleanup after rebase

---
 bench/microbench_low_level/Makefile           |   2 +-
 bench/microbench_low_level/README.md          | 180 ------------------
 bench/microbench_low_level/bench.py           |   1 +
 bench/microbench_low_level/result_sum_loop.md | 146 ++++++++++++++
 4 files changed, 148 insertions(+), 181 deletions(-)

diff --git a/bench/microbench_low_level/Makefile b/bench/microbench_low_level/Makefile
index e9f350e..256dc90 100644
--- a/bench/microbench_low_level/Makefile
+++ b/bench/microbench_low_level/Makefile
@@ -38,7 +38,7 @@ tmp_julia_init_zeros.txt: bench_init_zeros.jl
 clean:
 	rm -f tmp_*.txt
 
-produce_traces: tmp_result_julia.txt
+produce_traces: tmp_julia_sum_loop.txt
 	PYPYLOG=jit-log-opt,jit-summary,jit-backend-counts:tmp_traces_list.txt pypy bench.py list
 	PYPYLOG=jit-log-opt,jit-summary,jit-backend-counts:tmp_traces_piconumpy_list.txt pypy bench.py purepy
 	PYPYLOG=jit-log-opt,jit-summary,jit-backend-counts:tmp_traces_piconumpy_hpy.txt pypy bench.py _piconumpy_hpy
diff --git a/bench/microbench_low_level/README.md b/bench/microbench_low_level/README.md
index 0f1e32f..b8778f2 100644
--- a/bench/microbench_low_level/README.md
+++ b/bench/microbench_low_level/README.md
@@ -13,183 +13,3 @@ We measure the performance for functions containing low level Python code.
   between derivatives
 
 The files result_*.txt contain few results.
-
-We measure the performance for functions containing low level Python code.
-
-```python
-def sum_loop(arr):
-    result = 0.0
-    for value in arr:
-        result += value
-    return result
-```
-
-One can run the benchmarks with `make`.
-
-With PyPy3.7, I get:
-
-```
-{'cache_tag': 'pypy37',
- 'version': sys.pypy_version_info(major=7, minor=3, micro=6, releaselevel='final', serial=0)}
-list                          : 1.34e-05 s (  1.3 * Julia)
-piconumpy.purepy              : 1.33e-05 s (  1.3 * Julia)
-numpy                         : 4.00e-03 s (376.6 * Julia)
-_piconumpy_hpy                : 1.99e-04 s ( 18.8 * Julia)
-_piconumpy_cpython_capi       : 1.27e-03 s (119.5 * Julia)
-```
-
-With CPython:
-
-```
-{'cache_tag': 'cpython-39',
- 'version': sys.version_info(major=3, minor=9, micro=6, releaselevel='final', serial=0)}
-list                          : 2.62e-04 s ( 24.6 * Julia)
-piconumpy.purepy              : 1.25e-03 s (117.5 * Julia)
-numpy                         : 7.35e-04 s ( 69.2 * Julia)
-_piconumpy_hpy                : 4.26e-04 s ( 40.2 * Julia)
-_piconumpy_cpython_capi       : 3.52e-04 s ( 33.1 * Julia)
-```
-
-- PyPy is fast with list (1.3 * Julia, same order of magnitude that with Julia)
-and as fast for a piconumpy array based on a list ("piconumpy.purepy", zero
-cost abstraction!)
-
-- Numpy and _piconumpy_cpython_capi are both much slower with PyPy than with
-Cpython. We can guess that the Numpy port to HPy would fix that.
-
-- piconumpy_hpy is a bit faster with PyPy (19 * Julia) than with CPython (40 *
-Julia), however, we see that PyPy does not strongly accelerate piconumpy_hpy
-(19 * Julia, 14 * piconumpy_list).
-
-## Traces PyPy `sum_loop`
-
-### List
-
-```
-+557: label(p0, p1, p6, p9, f35, f30, p15, p22, p26, i32, i27, p29, descr=TargetToken(140447503809120))
-debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#10 FOR_ITER')
-+606: i44 = uint_ge(i32, i27)
-guard_false(i44, descr=<Guard0x7fbc7b939a00>) [p0, p6, p9, p15, p1, i32, i27, i44, p26, f30, f35]
-+615: f45 = getarrayitem_gc_f(p29, i32, descr=<ArrayF 8>)
-+622: i47 = int_add(i32, 1)
-debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#12 STORE_FAST')
-debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#14 LOAD_FAST')
-debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#16 LOAD_FAST')
-debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#18 INPLACE_ADD')
-+626: f48 = float_add(f35, f45)
-debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#20 STORE_FAST')
-debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#22 JUMP_ABSOLUTE')
-+630: setfield_gc(p15, i47, descr=<FieldS pypy.objspace.std.iterobject.W_AbstractSeqIterObject.inst_index 8>)
-+634: guard_not_invalidated(descr=<Guard0x7fbc7b939a60>) [p0, p6, p9, p15, p1, f45, f48, None, None]
-+634: i51 = getfield_raw_i(140447672379264, descr=<FieldS pypysig_long_struct.c_value 0>)
-+647: i53 = int_sub(i51, 1)
-+651: setfield_raw(140447672379264, i53, descr=<FieldS pypysig_long_struct.c_value 0>)
-+654: i56 = int_lt(i53, 0)
-+658: guard_false(i56, descr=<Guard0x7fbc7b939ac0>) [p0, p6, p9, p15, p1, i53, f45, f48, None, None]
-debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#10 FOR_ITER')
-+664: i57 = arraylen_gc(p29, descr=<ArrayF 8>)
-+664: jump(p0, p1, p6, p9, f48, f45, p15, p22, p26, i47, i27, p29, descr=TargetToken(140447503809120))
-```
-
-### piconumpy purepy (based on list)
-
-```
-+705: label(p0, p1, p6, p9, f53, f46, p15, p22, i49, p29, p38, p42, i43, p45, descr=TargetToken(139748702723776))
-debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#10 FOR_ITER')
-+760: guard_not_invalidated(descr=<Guard0x7f19c7c97d60>) [p0, p6, p9, p15, p1, p22, i49, f46, f53]
-+760: p62 = force_token()
-+760: enter_portal_frame(21, 28364)
-debug_merge_point(1, 1, '__getitem__;/home/pierre/Dev/piconumpy/piconumpy/purepy.py:27-28~#0 LOAD_FAST')
-debug_merge_point(1, 1, '__getitem__;/home/pierre/Dev/piconumpy/piconumpy/purepy.py:27-28~#2 LOAD_ATTR')
-debug_merge_point(1, 1, '__getitem__;/home/pierre/Dev/piconumpy/piconumpy/purepy.py:27-28~#4 LOAD_FAST')
-debug_merge_point(1, 1, '__getitem__;/home/pierre/Dev/piconumpy/piconumpy/purepy.py:27-28~#6 BINARY_SUBSCR')
-+760: i65 = uint_ge(i49, i43)
-+763: guard_false(i65, descr=<Guard0x7f19ba0b44a0>) [p0, p6, p9, p15, p1, p22, i49, f46, f53]
-+769: f66 = getarrayitem_gc_f(p45, i49, descr=<ArrayF 8>)
-debug_merge_point(1, 1, '__getitem__;/home/pierre/Dev/piconumpy/piconumpy/purepy.py:27-28~#8 RETURN_VALUE')
-+776: leave_portal_frame(21)
-+776: i69 = int_add(i49, 1)
-debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#12 STORE_FAST')
-debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#14 LOAD_FAST')
-debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#16 LOAD_FAST')
-debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#18 INPLACE_ADD')
-+780: f70 = float_add(f53, f66)
-debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#20 STORE_FAST')
-debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#22 JUMP_ABSOLUTE')
-+784: i72 = getfield_raw_i(139748871243648, descr=<FieldS pypysig_long_struct.c_value 0>)
-+797: i74 = int_sub(i72, 3)
-+801: setfield_raw(139748871243648, i74, descr=<FieldS pypysig_long_struct.c_value 0>)
-+804: setfield_gc(p15, i69, descr=<FieldS pypy.objspace.std.iterobject.W_AbstractSeqIterObject.inst_index 8>)
-+808: i77 = int_lt(i74, 0)
-+812: guard_false(i77, descr=<Guard0x7f19c7c97dc0>) [p0, p6, p9, p15, p1, i74, f66, f70, None, None, None]
-debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#10 FOR_ITER')
-+818: i78 = arraylen_gc(p45, descr=<ArrayF 8>)
-+818: jump(p0, p1, p6, p9, f70, f66, p15, p22, i69, p29, p38, p42, i43, p45, descr=TargetToken(139748702723776))
-```
-
-### piconumpy hpy
-
-```
-+1339: label(p0, p1, p6, p9, f73, p63, p15, i68, p62, descr=TargetToken(139865876151520))
-debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#10 FOR_ITER')
-+1352: p82 = getfield_gc_r(p15, descr=<FieldP pypy.objspace.std.iterobject.W_AbstractSeqIterObject.inst_w_seq 16>)
-+1356: guard_nonnull_class(p82, 139866025815200, descr=<Guard0x7f350fde1a60>) [p0, p6, p9, p63, p15, p1, p82, f73]
-+1376: p84 = getfield_gc_r(p82, descr=<FieldP pypy.interpreter.typedef.W_HPyObjectUserDictWeakrefable.inst_map 16>)
-+1387: guard_value(p84, ConstPtr(ptr85), descr=<Guard0x7f35021fe0b0>) [p0, p6, p9, p63, p15, p1, p82, f73]
-+1396: guard_not_invalidated(descr=<Guard0x7f35021fe0f8>) [p0, p6, p9, p63, p15, p1, p82, f73]
-+1403: p87 = getfield_gc_r(ConstPtr(ptr86), descr=<FieldP pypy.module._hpy_universal.interp_slot.W_SlotWrapper.inst_w_objclass 32>)
-+1414: guard_value(p87, ConstPtr(ptr88), descr=<Guard0x7f350fde1ac0>) [p0, p6, p9, p63, p15, p1, p82, f73]
-+1423: i90 = getfield_gc_i(ConstPtr(ptr89), descr=<FieldU pypy.module._hpy_universal.interp_slot.W_SlotWrapper.inst_cfuncptr 8>)
-+1427: i92 = int_lt(i68, 0)
-+1431: guard_false(i92, descr=<Guard0x7f35021fe140>) [p0, p6, p9, p63, p15, p1, p82, f73]
-+1444: i94 = getfield_gc_i(ConstPtr(ptr93), descr=<FieldS list.length 8>)
-+1448: i95 = int_is_zero(i94)
-+1451: guard_false(i95, descr=<Guard0x7f35021fe188>) [p0, p6, p9, p63, p15, p1, p82, f73]
-+1457: i97 = int_sub(i94, 1)
-+1461: p99 = getfield_gc_r(ConstPtr(ptr98), descr=<FieldP list.items 16>)
-+1465: i100 = getarrayitem_gc_i(p99, i97, descr=<ArrayS 8>)
-+1470: i101 = arraylen_gc(p99, descr=<ArrayS 8>)
-+1474: i103 = int_rshift(i101, 1)
-+1477: i105 = int_sub(i103, 5)
-+1481: i106 = int_lt(i97, i105)
-+1484: cond_call(i106, ConstClass(_ll_list_resize_hint_really_look_inside_iff__listPtr_Signed_Bool), ConstPtr(ptr108), i97, 0, descr=<Callv 0 rii EF=5>)
-+1490: guard_no_exception(descr=<Guard0x7f350fde1b20>) [p0, p6, p9, p63, p15, p1, i68, i90, i100, p82, i97, f73]
-+1490: setfield_gc(ConstPtr(ptr110), i97, descr=<FieldS list.length 8>)
-+1494: i112 = int_lt(i100, 0)
-+1498: guard_false(i112, descr=<Guard0x7f350fde1b80>) [p0, p6, p9, p63, p15, p1, i68, i90, i100, p82, f73]
-+1522: setarrayitem_gc(p62, i100, p82, descr=<ArrayP 8>)
-+1527: p113 = force_token()
-+1548: setfield_gc(p0, p113, descr=<FieldP pypy.interpreter.pyframe.PyFrame.vable_token 8>)
-+1552: i115 = call_may_force_i(i90, 139866044538144, i100, i68, descr=<Calli 8 iii EF=7>)
-+1663: guard_not_forced(descr=<Guard0x7f350fdfabe8>) [p0, p6, p9, p63, p15, p1, i100, i115, i68, p82, f73]
-+1674: guard_no_exception(descr=<Guard0x7f35021fe1d0>) [p0, p6, p9, p63, p15, p1, i100, i115, i68, p82, f73]
-+1688: call_n(ConstClass(close), i100, descr=<Callv 0 i EF=5>)
-+1754: guard_no_exception(descr=<Guard0x7f350fde1be0>) [p0, p6, p9, p63, p15, p1, i115, i68, p82, f73]
-+1768: i117 = int_is_true(i115)
-+1771: guard_true(i117, descr=<Guard0x7f35021fe218>) [p0, p6, p9, p63, p15, p1, i115, i68, p82, f73]
-+1784: p119 = getfield_gc_r(ConstPtr(ptr118), descr=<FieldP list.items 16>)
-+1788: p120 = getarrayitem_gc_r(p119, i115, descr=<ArrayP 8>)
-+1793: call_n(ConstClass(close), i115, descr=<Callv 0 i EF=5>)
-+1866: guard_no_exception(descr=<Guard0x7f350fde1c40>) [p0, p6, p9, p63, p15, p1, p120, i68, p82, f73]
-+1880: guard_nonnull_class(p120, ConstClass(W_FloatObject), descr=<Guard0x7f350fde1ca0>) [p0, p6, p9, p63, p15, p1, p120, i68, p82, f73]
-+1907: i123 = getfield_gc_i(p15, descr=<FieldS pypy.objspace.std.iterobject.W_AbstractSeqIterObject.inst_index 8>)
-+1918: i125 = int_add(i123, 1)
-debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#12 STORE_FAST')
-debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#14 LOAD_FAST')
-debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#16 LOAD_FAST')
-debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#18 INPLACE_ADD')
-+1923: setfield_gc(p15, i125, descr=<FieldS pypy.objspace.std.iterobject.W_AbstractSeqIterObject.inst_index 8>)
-+1927: f126 = getfield_gc_f(p120, descr=<FieldF pypy.objspace.std.floatobject.W_FloatObject.inst_floatval 8 pure>)
-+1933: f127 = float_add(f73, f126)
-debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#20 STORE_FAST')
-debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#22 JUMP_ABSOLUTE')
-+1947: guard_not_invalidated(descr=<Guard0x7f350fde1d00>) [p0, p6, p9, p120, p15, p1, f127, None, None, None]
-+1947: i129 = getfield_raw_i(139866044675968, descr=<FieldS pypysig_long_struct.c_value 0>)
-+1960: i131 = int_sub(i129, 3)
-+1964: setfield_raw(139866044675968, i131, descr=<FieldS pypysig_long_struct.c_value 0>)
-+1967: i134 = int_lt(i131, 0)
-+1971: guard_false(i134, descr=<Guard0x7f350fde1d60>) [p0, p6, p9, p120, p15, p1, i131, f127, None, None, None]
-debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#10 FOR_ITER')
-+1977: i135 = arraylen_gc(p119, descr=<ArrayP 8>)
-+1977: jump(p0, p1, p6, p9, f127, p120, p15, i125, p119, descr=TargetToken(139865876151520))
-```
diff --git a/bench/microbench_low_level/bench.py b/bench/microbench_low_level/bench.py
index 8524fd5..305c917 100644
--- a/bench/microbench_low_level/bench.py
+++ b/bench/microbench_low_level/bench.py
@@ -100,6 +100,7 @@ def median(sequence):
     return tmp[len(tmp) // 2]
 
 
+# measure during ~ 2s
 t0 = perf_counter()
 times = []
 while perf_counter() - t0 < 2.0:
diff --git a/bench/microbench_low_level/result_sum_loop.md b/bench/microbench_low_level/result_sum_loop.md
index f2d0a86..48e32f2 100644
--- a/bench/microbench_low_level/result_sum_loop.md
+++ b/bench/microbench_low_level/result_sum_loop.md
@@ -54,3 +54,149 @@ numpy                         : ImportError numpy
 _piconumpy_hpy                : 5.03e-04 s ( 38.0 * Julia)
 _piconumpy_cpython_capi       : 2.90e-03 s (219.1 * Julia)
 ```
+
+## Summary
+
+- PyPy is fast with list (1.3 * Julia, the same order of magnitude as Julia)
+and just as fast for a piconumpy array based on a list ("piconumpy.purepy", a
+zero-cost abstraction!)
+
+- Numpy and _piconumpy_cpython_capi are both much slower with PyPy than with
+CPython. We can guess that the Numpy port to HPy would fix that.
+
+- piconumpy_hpy is a bit faster with PyPy (19 * Julia) than with CPython (40 *
+Julia). However, we see that PyPy does not strongly accelerate piconumpy_hpy
+(19 * Julia, 14 * piconumpy_list).
+
+## Traces PyPy `sum_loop`
+
+### List
+
+```
++557: label(p0, p1, p6, p9, f35, f30, p15, p22, p26, i32, i27, p29, descr=TargetToken(140447503809120))
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#10 FOR_ITER')
++606: i44 = uint_ge(i32, i27)
+guard_false(i44, descr=<Guard0x7fbc7b939a00>) [p0, p6, p9, p15, p1, i32, i27, i44, p26, f30, f35]
++615: f45 = getarrayitem_gc_f(p29, i32, descr=<ArrayF 8>)
++622: i47 = int_add(i32, 1)
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#12 STORE_FAST')
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#14 LOAD_FAST')
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#16 LOAD_FAST')
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#18 INPLACE_ADD')
++626: f48 = float_add(f35, f45)
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#20 STORE_FAST')
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#22 JUMP_ABSOLUTE')
++630: setfield_gc(p15, i47, descr=<FieldS pypy.objspace.std.iterobject.W_AbstractSeqIterObject.inst_index 8>)
++634: guard_not_invalidated(descr=<Guard0x7fbc7b939a60>) [p0, p6, p9, p15, p1, f45, f48, None, None]
++634: i51 = getfield_raw_i(140447672379264, descr=<FieldS pypysig_long_struct.c_value 0>)
++647: i53 = int_sub(i51, 1)
++651: setfield_raw(140447672379264, i53, descr=<FieldS pypysig_long_struct.c_value 0>)
++654: i56 = int_lt(i53, 0)
++658: guard_false(i56, descr=<Guard0x7fbc7b939ac0>) [p0, p6, p9, p15, p1, i53, f45, f48, None, None]
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#10 FOR_ITER')
++664: i57 = arraylen_gc(p29, descr=<ArrayF 8>)
++664: jump(p0, p1, p6, p9, f48, f45, p15, p22, p26, i47, i27, p29, descr=TargetToken(140447503809120))
+```
+
+### piconumpy purepy (based on list)
+
+```
++705: label(p0, p1, p6, p9, f53, f46, p15, p22, i49, p29, p38, p42, i43, p45, descr=TargetToken(139748702723776))
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#10 FOR_ITER')
++760: guard_not_invalidated(descr=<Guard0x7f19c7c97d60>) [p0, p6, p9, p15, p1, p22, i49, f46, f53]
++760: p62 = force_token()
++760: enter_portal_frame(21, 28364)
+debug_merge_point(1, 1, '__getitem__;/home/pierre/Dev/piconumpy/piconumpy/purepy.py:27-28~#0 LOAD_FAST')
+debug_merge_point(1, 1, '__getitem__;/home/pierre/Dev/piconumpy/piconumpy/purepy.py:27-28~#2 LOAD_ATTR')
+debug_merge_point(1, 1, '__getitem__;/home/pierre/Dev/piconumpy/piconumpy/purepy.py:27-28~#4 LOAD_FAST')
+debug_merge_point(1, 1, '__getitem__;/home/pierre/Dev/piconumpy/piconumpy/purepy.py:27-28~#6 BINARY_SUBSCR')
++760: i65 = uint_ge(i49, i43)
++763: guard_false(i65, descr=<Guard0x7f19ba0b44a0>) [p0, p6, p9, p15, p1, p22, i49, f46, f53]
++769: f66 = getarrayitem_gc_f(p45, i49, descr=<ArrayF 8>)
+debug_merge_point(1, 1, '__getitem__;/home/pierre/Dev/piconumpy/piconumpy/purepy.py:27-28~#8 RETURN_VALUE')
++776: leave_portal_frame(21)
++776: i69 = int_add(i49, 1)
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#12 STORE_FAST')
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#14 LOAD_FAST')
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#16 LOAD_FAST')
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#18 INPLACE_ADD')
++780: f70 = float_add(f53, f66)
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#20 STORE_FAST')
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#22 JUMP_ABSOLUTE')
++784: i72 = getfield_raw_i(139748871243648, descr=<FieldS pypysig_long_struct.c_value 0>)
++797: i74 = int_sub(i72, 3)
++801: setfield_raw(139748871243648, i74, descr=<FieldS pypysig_long_struct.c_value 0>)
++804: setfield_gc(p15, i69, descr=<FieldS pypy.objspace.std.iterobject.W_AbstractSeqIterObject.inst_index 8>)
++808: i77 = int_lt(i74, 0)
++812: guard_false(i77, descr=<Guard0x7f19c7c97dc0>) [p0, p6, p9, p15, p1, i74, f66, f70, None, None, None]
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#10 FOR_ITER')
++818: i78 = arraylen_gc(p45, descr=<ArrayF 8>)
++818: jump(p0, p1, p6, p9, f70, f66, p15, p22, i69, p29, p38, p42, i43, p45, descr=TargetToken(139748702723776))
+```
+
+### piconumpy hpy
+
+```
++1339: label(p0, p1, p6, p9, f73, p63, p15, i68, p62, descr=TargetToken(139865876151520))
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#10 FOR_ITER')
++1352: p82 = getfield_gc_r(p15, descr=<FieldP pypy.objspace.std.iterobject.W_AbstractSeqIterObject.inst_w_seq 16>)
++1356: guard_nonnull_class(p82, 139866025815200, descr=<Guard0x7f350fde1a60>) [p0, p6, p9, p63, p15, p1, p82, f73]
++1376: p84 = getfield_gc_r(p82, descr=<FieldP pypy.interpreter.typedef.W_HPyObjectUserDictWeakrefable.inst_map 16>)
++1387: guard_value(p84, ConstPtr(ptr85), descr=<Guard0x7f35021fe0b0>) [p0, p6, p9, p63, p15, p1, p82, f73]
++1396: guard_not_invalidated(descr=<Guard0x7f35021fe0f8>) [p0, p6, p9, p63, p15, p1, p82, f73]
++1403: p87 = getfield_gc_r(ConstPtr(ptr86), descr=<FieldP pypy.module._hpy_universal.interp_slot.W_SlotWrapper.inst_w_objclass 32>)
++1414: guard_value(p87, ConstPtr(ptr88), descr=<Guard0x7f350fde1ac0>) [p0, p6, p9, p63, p15, p1, p82, f73]
++1423: i90 = getfield_gc_i(ConstPtr(ptr89), descr=<FieldU pypy.module._hpy_universal.interp_slot.W_SlotWrapper.inst_cfuncptr 8>)
++1427: i92 = int_lt(i68, 0)
++1431: guard_false(i92, descr=<Guard0x7f35021fe140>) [p0, p6, p9, p63, p15, p1, p82, f73]
++1444: i94 = getfield_gc_i(ConstPtr(ptr93), descr=<FieldS list.length 8>)
++1448: i95 = int_is_zero(i94)
++1451: guard_false(i95, descr=<Guard0x7f35021fe188>) [p0, p6, p9, p63, p15, p1, p82, f73]
++1457: i97 = int_sub(i94, 1)
++1461: p99 = getfield_gc_r(ConstPtr(ptr98), descr=<FieldP list.items 16>)
++1465: i100 = getarrayitem_gc_i(p99, i97, descr=<ArrayS 8>)
++1470: i101 = arraylen_gc(p99, descr=<ArrayS 8>)
++1474: i103 = int_rshift(i101, 1)
++1477: i105 = int_sub(i103, 5)
++1481: i106 = int_lt(i97, i105)
++1484: cond_call(i106, ConstClass(_ll_list_resize_hint_really_look_inside_iff__listPtr_Signed_Bool), ConstPtr(ptr108), i97, 0, descr=<Callv 0 rii EF=5>)
++1490: guard_no_exception(descr=<Guard0x7f350fde1b20>) [p0, p6, p9, p63, p15, p1, i68, i90, i100, p82, i97, f73]
++1490: setfield_gc(ConstPtr(ptr110), i97, descr=<FieldS list.length 8>)
++1494: i112 = int_lt(i100, 0)
++1498: guard_false(i112, descr=<Guard0x7f350fde1b80>) [p0, p6, p9, p63, p15, p1, i68, i90, i100, p82, f73]
++1522: setarrayitem_gc(p62, i100, p82, descr=<ArrayP 8>)
++1527: p113 = force_token()
++1548: setfield_gc(p0, p113, descr=<FieldP pypy.interpreter.pyframe.PyFrame.vable_token 8>)
++1552: i115 = call_may_force_i(i90, 139866044538144, i100, i68, descr=<Calli 8 iii EF=7>)
++1663: guard_not_forced(descr=<Guard0x7f350fdfabe8>) [p0, p6, p9, p63, p15, p1, i100, i115, i68, p82, f73]
++1674: guard_no_exception(descr=<Guard0x7f35021fe1d0>) [p0, p6, p9, p63, p15, p1, i100, i115, i68, p82, f73]
++1688: call_n(ConstClass(close), i100, descr=<Callv 0 i EF=5>)
++1754: guard_no_exception(descr=<Guard0x7f350fde1be0>) [p0, p6, p9, p63, p15, p1, i115, i68, p82, f73]
++1768: i117 = int_is_true(i115)
++1771: guard_true(i117, descr=<Guard0x7f35021fe218>) [p0, p6, p9, p63, p15, p1, i115, i68, p82, f73]
++1784: p119 = getfield_gc_r(ConstPtr(ptr118), descr=<FieldP list.items 16>)
++1788: p120 = getarrayitem_gc_r(p119, i115, descr=<ArrayP 8>)
++1793: call_n(ConstClass(close), i115, descr=<Callv 0 i EF=5>)
++1866: guard_no_exception(descr=<Guard0x7f350fde1c40>) [p0, p6, p9, p63, p15, p1, p120, i68, p82, f73]
++1880: guard_nonnull_class(p120, ConstClass(W_FloatObject), descr=<Guard0x7f350fde1ca0>) [p0, p6, p9, p63, p15, p1, p120, i68, p82, f73]
++1907: i123 = getfield_gc_i(p15, descr=<FieldS pypy.objspace.std.iterobject.W_AbstractSeqIterObject.inst_index 8>)
++1918: i125 = int_add(i123, 1)
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#12 STORE_FAST')
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#14 LOAD_FAST')
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#16 LOAD_FAST')
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#18 INPLACE_ADD')
++1923: setfield_gc(p15, i125, descr=<FieldS pypy.objspace.std.iterobject.W_AbstractSeqIterObject.inst_index 8>)
++1927: f126 = getfield_gc_f(p120, descr=<FieldF pypy.objspace.std.floatobject.W_FloatObject.inst_floatval 8 pure>)
++1933: f127 = float_add(f73, f126)
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#20 STORE_FAST')
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#22 JUMP_ABSOLUTE')
++1947: guard_not_invalidated(descr=<Guard0x7f350fde1d00>) [p0, p6, p9, p120, p15, p1, f127, None, None, None]
++1947: i129 = getfield_raw_i(139866044675968, descr=<FieldS pypysig_long_struct.c_value 0>)
++1960: i131 = int_sub(i129, 3)
++1964: setfield_raw(139866044675968, i131, descr=<FieldS pypysig_long_struct.c_value 0>)
++1967: i134 = int_lt(i131, 0)
++1971: guard_false(i134, descr=<Guard0x7f350fde1d60>) [p0, p6, p9, p120, p15, p1, i131, f127, None, None, None]
+debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#10 FOR_ITER')
++1977: i135 = arraylen_gc(p119, descr=<ArrayP 8>)
++1977: jump(p0, p1, p6, p9, f127, p120, p15, i125, p119, descr=TargetToken(139865876151520))
+```

From 3e379acf854cc1a5f0355de74aea4934f934f05b Mon Sep 17 00:00:00 2001
From: paugier <pierre.augier@univ-grenoble-alpes.fr>
Date: Fri, 10 Dec 2021 10:48:40 +0100
Subject: [PATCH 18/32] Cosmetic changes output microbench

---
 bench/microbench_low_level/bench.py            |  3 +++
 bench/microbench_low_level/result_cort.md      | 18 +++++++++---------
 .../microbench_low_level/result_init_zeros.md  | 18 +++++++++---------
 bench/microbench_low_level/result_sum_loop.md  | 18 +++++++++---------
 .../result_sum_loop_index.md                   | 18 +++++++++---------
 5 files changed, 39 insertions(+), 36 deletions(-)

diff --git a/bench/microbench_low_level/bench.py b/bench/microbench_low_level/bench.py
index 305c917..5ab4a21 100644
--- a/bench/microbench_low_level/bench.py
+++ b/bench/microbench_low_level/bench.py
@@ -85,6 +85,9 @@ def cort(arr):
     if "piconumpy" not in method:
         method = f"piconumpy.{method}"
 
+if "_piconumpy_" in method:
+    method = method.replace("_piconumpy_", "piconumpy.")
+
 size = 10000
 
 # warming during ~ 1s
diff --git a/bench/microbench_low_level/result_cort.md b/bench/microbench_low_level/result_cort.md
index 4b8ff6c..b5578bf 100644
--- a/bench/microbench_low_level/result_cort.md
+++ b/bench/microbench_low_level/result_cort.md
@@ -29,10 +29,10 @@ hostname: voyage
 {'cache_tag': 'pypy37',
  'version': sys.pypy_version_info(major=7, minor=3, micro=7, releaselevel='final', serial=0)}
 list                          : 4.29e-05 s (  1.8 * Julia)
-purepy                        : 4.12e-05 s (  1.7 * Julia)
+piconumpy.purepy              : 4.12e-05 s (  1.7 * Julia)
 numpy                         : 4.77e-02 s (1975.5 * Julia)
-_piconumpy_hpy                : 1.46e-03 s ( 60.5 * Julia)
-_piconumpy_cpython_capi       : 6.96e-03 s (288.5 * Julia)
+piconumpy.hpy                 : 1.46e-03 s ( 60.5 * Julia)
+piconumpy.cpython_capi        : 6.96e-03 s (288.5 * Julia)
 ```
 
 With CPython:
@@ -43,10 +43,10 @@ hostname: voyage
 {'cache_tag': 'cpython-39',
  'version': sys.version_info(major=3, minor=9, micro=7, releaselevel='final', serial=0)}
 list                          : 4.42e-03 s (183.4 * Julia)
-purepy                        : 1.04e-02 s (430.0 * Julia)
+piconumpy.purepy              : 1.04e-02 s (430.0 * Julia)
 numpy                         : 9.76e-03 s (404.4 * Julia)
-_piconumpy_hpy                : 5.66e-03 s (234.7 * Julia)
-_piconumpy_cpython_capi       : 4.77e-03 s (197.7 * Julia)
+piconumpy.hpy                 : 5.66e-03 s (234.7 * Julia)
+piconumpy.cpython_capi        : 4.77e-03 s (197.7 * Julia)
 ```
 
 With Python 3.8.5 (GraalVM CE Native 21.3.0)
@@ -57,8 +57,8 @@ hostname: voyage
 {'cache_tag': 'graalpython-38',
  'version': sys.version_info(major=3, minor=8, micro=5, releaselevel='alpha', serial=0)}
 list                          : 2.44e-05 s (  1.0 * Julia)
-purepy                        : 3.13e-05 s (  1.3 * Julia)
+piconumpy.purepy              : 3.13e-05 s (  1.3 * Julia)
 numpy                         : ImportError numpy
-_piconumpy_hpy                : 1.69e-04 s (  7.0 * Julia)
-_piconumpy_cpython_capi       : 3.55e-04 s ( 14.7 * Julia)
+piconumpy.hpy                 : 1.69e-04 s (  7.0 * Julia)
+piconumpy.cpython_capi        : 3.55e-04 s ( 14.7 * Julia)
 ```
diff --git a/bench/microbench_low_level/result_init_zeros.md b/bench/microbench_low_level/result_init_zeros.md
index 68eee34..b88e4bd 100644
--- a/bench/microbench_low_level/result_init_zeros.md
+++ b/bench/microbench_low_level/result_init_zeros.md
@@ -18,10 +18,10 @@ hostname: voyage
 {'cache_tag': 'pypy37',
  'version': sys.pypy_version_info(major=7, minor=3, micro=7, releaselevel='final', serial=0)}
 list                          : 2.63e-05 s (  5.4 * Julia)
-purepy                        : 2.99e-05 s (  6.1 * Julia)
+piconumpy.purepy              : 2.99e-05 s (  6.1 * Julia)
 numpy                         : 1.17e-02 s (2403.5 * Julia)
-_piconumpy_hpy                : 4.58e-04 s ( 94.1 * Julia)
-_piconumpy_cpython_capi       : 8.46e-04 s (173.6 * Julia)
+piconumpy.hpy                 : 4.58e-04 s ( 94.1 * Julia)
+piconumpy.cpython_capi        : 8.46e-04 s (173.6 * Julia)
 ```
 
 With CPython:
@@ -32,10 +32,10 @@ hostname: voyage
 {'cache_tag': 'cpython-39',
  'version': sys.version_info(major=3, minor=9, micro=7, releaselevel='final', serial=0)}
 list                          : 5.34e-04 s (109.6 * Julia)
-purepy                        : 2.03e-03 s (417.4 * Julia)
+piconumpy.purepy              : 2.03e-03 s (417.4 * Julia)
 numpy                         : 1.17e-03 s (239.3 * Julia)
-_piconumpy_hpy                : 7.51e-04 s (154.1 * Julia)
-_piconumpy_cpython_capi       : 5.44e-04 s (111.5 * Julia)
+piconumpy.hpy                 : 7.51e-04 s (154.1 * Julia)
+piconumpy.cpython_capi        : 5.44e-04 s (111.5 * Julia)
 ```
 
 With Python 3.8.5 (GraalVM CE Native 21.3.0)
@@ -46,8 +46,8 @@ hostname: voyage
 {'cache_tag': 'graalpython-38',
  'version': sys.version_info(major=3, minor=8, micro=5, releaselevel='alpha', serial=0)}
 list                          : 1.37e-05 s (  2.8 * Julia)
-purepy                        : 1.93e-05 s (  4.0 * Julia)
+piconumpy.purepy              : 1.93e-05 s (  4.0 * Julia)
 numpy                         : ImportError numpy
-_piconumpy_hpy                : 4.68e-05 s (  9.6 * Julia)
-_piconumpy_cpython_capi       : 1.74e-04 s ( 35.8 * Julia)
+piconumpy.hpy                 : 4.68e-05 s (  9.6 * Julia)
+piconumpy.cpython_capi        : 1.74e-04 s ( 35.8 * Julia)
 ```
diff --git a/bench/microbench_low_level/result_sum_loop.md b/bench/microbench_low_level/result_sum_loop.md
index 48e32f2..b415051 100644
--- a/bench/microbench_low_level/result_sum_loop.md
+++ b/bench/microbench_low_level/result_sum_loop.md
@@ -20,10 +20,10 @@ hostname: voyage
 {'cache_tag': 'pypy37',
  'version': sys.pypy_version_info(major=7, minor=3, micro=7, releaselevel='final', serial=0)}
 list                          : 2.35e-05 s (  1.8 * Julia)
-purepy                        : 2.60e-05 s (  2.0 * Julia)
+piconumpy.purepy              : 2.60e-05 s (  2.0 * Julia)
 numpy                         : 8.97e-03 s (677.0 * Julia)
-_piconumpy_hpy                : 3.73e-04 s ( 28.2 * Julia)
-_piconumpy_cpython_capi       : 1.75e-03 s (132.4 * Julia)
+piconumpy.hpy                 : 3.73e-04 s ( 28.2 * Julia)
+piconumpy.cpython_capi        : 1.75e-03 s (132.4 * Julia)
 ```
 
 With CPython:
@@ -34,10 +34,10 @@ hostname: voyage
 {'cache_tag': 'cpython-39',
  'version': sys.version_info(major=3, minor=9, micro=7, releaselevel='final', serial=0)}
 list                          : 3.65e-04 s ( 27.5 * Julia)
-purepy                        : 2.17e-03 s (164.1 * Julia)
+piconumpy.purepy              : 2.17e-03 s (164.1 * Julia)
 numpy                         : 1.09e-03 s ( 82.2 * Julia)
-_piconumpy_hpy                : 7.39e-04 s ( 55.8 * Julia)
-_piconumpy_cpython_capi       : 5.07e-04 s ( 38.3 * Julia)
+piconumpy.hpy                 : 7.39e-04 s ( 55.8 * Julia)
+piconumpy.cpython_capi        : 5.07e-04 s ( 38.3 * Julia)
 
 ```
 
@@ -49,10 +49,10 @@ hostname: voyage
 {'cache_tag': 'graalpython-38',
  'version': sys.version_info(major=3, minor=8, micro=5, releaselevel='alpha', serial=0)}
 list                          : 1.92e-05 s (  1.4 * Julia)
-purepy                        : 3.61e-05 s (  2.7 * Julia)
+piconumpy.purepy              : 3.61e-05 s (  2.7 * Julia)
 numpy                         : ImportError numpy
-_piconumpy_hpy                : 5.03e-04 s ( 38.0 * Julia)
-_piconumpy_cpython_capi       : 2.90e-03 s (219.1 * Julia)
+piconumpy.hpy                 : 5.03e-04 s ( 38.0 * Julia)
+piconumpy.cpython_capi        : 2.90e-03 s (219.1 * Julia)
 ```
 
 ## Summary
diff --git a/bench/microbench_low_level/result_sum_loop_index.md b/bench/microbench_low_level/result_sum_loop_index.md
index 4a56d2b..fd63301 100644
--- a/bench/microbench_low_level/result_sum_loop_index.md
+++ b/bench/microbench_low_level/result_sum_loop_index.md
@@ -20,10 +20,10 @@ hostname: voyage
 {'cache_tag': 'pypy37',
  'version': sys.pypy_version_info(major=7, minor=3, micro=7, releaselevel='final', serial=0)}
 list                          : 1.19e-05 s (  2.0 * Julia)
-purepy                        : 1.64e-05 s (  2.8 * Julia)
+piconumpy.purepy              : 1.64e-05 s (  2.8 * Julia)
 numpy                         : 4.18e-03 s (711.4 * Julia)
-_piconumpy_hpy                : 1.73e-04 s ( 29.4 * Julia)
-_piconumpy_cpython_capi       : 8.44e-04 s (143.8 * Julia)
+piconumpy.hpy                 : 1.73e-04 s ( 29.4 * Julia)
+piconumpy.cpython_capi        : 8.44e-04 s (143.8 * Julia)
 ```
 
 With CPython:
@@ -34,10 +34,10 @@ hostname: voyage
 {'cache_tag': 'cpython-39',
  'version': sys.version_info(major=3, minor=9, micro=7, releaselevel='final', serial=0)}
 list                          : 3.91e-04 s ( 66.5 * Julia)
-purepy                        : 1.11e-03 s (188.3 * Julia)
+piconumpy.purepy              : 1.11e-03 s (188.3 * Julia)
 numpy                         : 8.93e-04 s (152.1 * Julia)
-_piconumpy_hpy                : 5.42e-04 s ( 92.3 * Julia)
-_piconumpy_cpython_capi       : 4.17e-04 s ( 71.0 * Julia)
+piconumpy.hpy                 : 5.42e-04 s ( 92.3 * Julia)
+piconumpy.cpython_capi        : 4.17e-04 s ( 71.0 * Julia)
 ```
 
 With Python 3.8.5 (GraalVM CE Native 21.3.0)
@@ -48,8 +48,8 @@ hostname: voyage
 {'cache_tag': 'graalpython-38',
  'version': sys.version_info(major=3, minor=8, micro=5, releaselevel='alpha', serial=0)}
 list                          : 1.36e-05 s (  2.3 * Julia)
-purepy                        : 1.81e-05 s (  3.1 * Julia)
+piconumpy.purepy              : 1.81e-05 s (  3.1 * Julia)
 numpy                         : ImportError numpy
-_piconumpy_hpy                : 3.68e-05 s (  6.3 * Julia)
-_piconumpy_cpython_capi       : 1.08e-04 s ( 18.5 * Julia)
+piconumpy.hpy                 : 3.68e-05 s (  6.3 * Julia)
+piconumpy.cpython_capi        : 1.08e-04 s ( 18.5 * Julia)
 ```

From 9d7f176a8d9e9c0f969c9506bd192b055529eb22 Mon Sep 17 00:00:00 2001
From: paugier <pierre.augier@univ-grenoble-alpes.fr>
Date: Fri, 10 Dec 2021 14:36:37 +0100
Subject: [PATCH 19/32] Add "board" benchmark

---
 bench/microbench_low_level/Makefile           |  7 ++
 bench/microbench_low_level/README.md          |  3 +
 bench/microbench_low_level/bench.py           | 76 ++++++++++++------
 bench/microbench_low_level/bench_board.jl     | 44 +++++++++++
 bench/microbench_low_level/result_board.md    | 77 +++++++++++++++++++
 bench/microbench_low_level/result_sum_loop.md |  1 -
 6 files changed, 184 insertions(+), 24 deletions(-)
 create mode 100644 bench/microbench_low_level/bench_board.jl
 create mode 100644 bench/microbench_low_level/result_board.md

diff --git a/bench/microbench_low_level/Makefile b/bench/microbench_low_level/Makefile
index 256dc90..15c96e3 100644
--- a/bench/microbench_low_level/Makefile
+++ b/bench/microbench_low_level/Makefile
@@ -13,6 +13,9 @@ bench_cort: tmp_julia_cort.txt _bench
 bench_init_zeros: NAME_BENCH=init_zeros
 bench_init_zeros: tmp_julia_init_zeros.txt _bench
 
+bench_board: NAME_BENCH=board
+bench_board: tmp_julia_board.txt _bench
+
 _bench:
 	@echo bench $(NAME_BENCH)
 	@python -c "from socket import gethostname as f; print('hostname:', f())"
@@ -35,6 +38,10 @@ tmp_julia_cort.txt: bench_cort.jl
 tmp_julia_init_zeros.txt: bench_init_zeros.jl
 	@julia bench_init_zeros.jl > tmp_julia_init_zeros.txt
 
+tmp_julia_board.txt: bench_board.jl
+	@julia bench_board.jl > tmp_julia_board.txt
+
+
 clean:
 	rm -f tmp_*.txt
 
diff --git a/bench/microbench_low_level/README.md b/bench/microbench_low_level/README.md
index b8778f2..e017263 100644
--- a/bench/microbench_low_level/README.md
+++ b/bench/microbench_low_level/README.md
@@ -12,4 +12,7 @@ We measure the performance for functions containing low level Python code.
 - `cort` (command `make bench_cort`): normalized cosine similarity measure
   between derivatives
 
+- `board` (command `make bench_board`): few indexing, simple float computations
+  with sin/cos and instantiation of a small array.
+
 The files result_*.txt contain few results.
diff --git a/bench/microbench_low_level/bench.py b/bench/microbench_low_level/bench.py
index 5ab4a21..570d26a 100644
--- a/bench/microbench_low_level/bench.py
+++ b/bench/microbench_low_level/bench.py
@@ -2,7 +2,7 @@
 from time import perf_counter
 from pathlib import Path
 from random import random
-from math import sqrt
+from math import sqrt, pi, sin, cos
 
 try:
     method = sys.argv[1]
@@ -15,6 +15,33 @@
     name_bench = "sum_loop"
 
 
+if method == "_piconumpy_hpy":
+    from piconumpy.util_hpy import import_ext
+
+    ext = import_ext()
+    array = ext.array
+elif method == "list":
+    array = list
+elif method == "numpy":
+
+    try:
+        import numpy as np
+    except ImportError:
+        print(f"{method:30s}: ImportError numpy")
+        sys.exit(0)
+
+    array = np.array
+else:
+    d = {}
+    exec(f"from piconumpy.{method} import array", d)
+    array = d["array"]
+    if "piconumpy" not in method:
+        method = f"piconumpy.{method}"
+
+if "_piconumpy_" in method:
+    method = method.replace("_piconumpy_", "piconumpy.")
+
+
 tmp_result_julia = Path(f"tmp_julia_{name_bench}.txt")
 if tmp_result_julia.exists():
     with open(tmp_result_julia) as file:
@@ -59,34 +86,37 @@ def cort(arr):
     return _cort(arr, arr)
 
 
-compute_from_arr = locals()[name_bench]
+def board(X_0):
+    x0 = X_0[0]
+    y0 = X_0[1]
+    u0 = X_0[2]
+    v0 = X_0[3]
 
+    g = 9.81
+    b = 0.5
+    a = 0.25
+    c = 0.5
+    p = (2 * pi) / 10.0
+    q = (2 * pi) / 4.0
 
-if method == "_piconumpy_hpy":
-    from piconumpy.util_hpy import import_ext
+    H_x = -a + b * p * sin(p * x0) * cos(q * y0)
+    H_xx = b * p ** 2 * cos(p * x0) * cos(q * y0)
+    H_y = b * q * cos(p * x0) * sin(q * y0)
+    H_yy = b * q ** 2 * cos(p * x0) * cos(q * y0)
+    H_xy = -b * q * p * sin(p * x0) * sin(q * y0)
 
-    ext = import_ext()
-    array = ext.array
-elif method == "list":
-    array = list
-elif method == "numpy":
+    F = (g + H_xx * u0 ** 2 + 2 * H_xy * u0 * v0 + H_yy * v0 ** 2) / (
+        1 + H_x ** 2 + H_y ** 2
+    )
 
-    try:
-        import numpy as np
-    except ImportError:
-        print(f"{method:30s}: ImportError numpy")
-        sys.exit(0)
+    dU = -F * H_x - c * u0
+    dV = -F * H_y - c * v0
 
-    array = np.array
-else:
-    d = {}
-    exec(f"from piconumpy.{method} import array", d)
-    array = d["array"]
-    if "piconumpy" not in method:
-        method = f"piconumpy.{method}"
+    return array([u0, v0, dU, dV])
+
+
+compute_from_arr = locals()[name_bench]
 
-if "_piconumpy_" in method:
-    method = method.replace("_piconumpy_", "piconumpy.")
 
 size = 10000
 
diff --git a/bench/microbench_low_level/bench_board.jl b/bench/microbench_low_level/bench_board.jl
new file mode 100644
index 0000000..63187b1
--- /dev/null
+++ b/bench/microbench_low_level/bench_board.jl
@@ -0,0 +1,44 @@
+using Statistics
+
+function board(X_0::Array)
+
+    x0 = copy(X_0[1])
+    y0 = copy(X_0[2])
+    u0 = copy(X_0[3])
+    v0 = copy(X_0[4])
+
+    g = 9.81
+    a = 0.25
+    b = 0.5
+    c = 0.5
+    p = (2*π)/10.0
+    q = (2*π)/4.0
+
+    H_x = -a + b*p*sin(p*x0)*cos(q*y0)
+    H_xx = b*p^2 * cos(p*x0)*cos(q*y0)
+    H_y = b*q*cos(p*x0)*sin(q*y0)
+    H_yy = b*q^2 * cos(p*x0)*cos(q*y0)
+    H_xy = -b*q*p*sin(p*x0)*sin(q*y0)
+
+    F = (g + H_xx*u0^2 + 2*H_xy*u0*v0 + H_yy*v0^2)/(1 + H_x^2 + H_y^2)
+
+    dU = -F*H_x - c*u0
+    dV = -F*H_y - c*v0
+
+    return [u0, v0, dU, dV]
+
+end
+
+compute_from_arr = board
+
+size = 10000
+nb_runs = 200
+
+times = zeros(nb_runs)
+
+for irun in 1:nb_runs
+    arr = rand(size)
+    times[irun] = @elapsed compute_from_arr(arr)
+end
+
+println(median(times))
diff --git a/bench/microbench_low_level/result_board.md b/bench/microbench_low_level/result_board.md
new file mode 100644
index 0000000..30b407b
--- /dev/null
+++ b/bench/microbench_low_level/result_board.md
@@ -0,0 +1,77 @@
+# Microbenchmark board
+
+We measure the performance for this function:
+
+```python
+def board(X_0):
+    x0 = X_0[0]
+    y0 = X_0[1]
+    u0 = X_0[2]
+    v0 = X_0[3]
+
+    g = 9.81
+    b = 0.5
+    a = 0.25
+    c = 0.5
+    p = (2 * pi) / 10.0
+    q = (2 * pi) / 4.0
+
+    H_x = -a + b * p * sin(p * x0) * cos(q * y0)
+    H_xx = b * p ** 2 * cos(p * x0) * cos(q * y0)
+    H_y = b * q * cos(p * x0) * sin(q * y0)
+    H_yy = b * q ** 2 * cos(p * x0) * cos(q * y0)
+    H_xy = -b * q * p * sin(p * x0) * sin(q * y0)
+
+    F = (g + H_xx * u0 ** 2 + 2 * H_xy * u0 * v0 + H_yy * v0 ** 2) / (
+        1 + H_x ** 2 + H_y ** 2
+    )
+
+    dU = -F * H_x - c * u0
+    dV = -F * H_y - c * v0
+
+    return array([u0, v0, dU, dV])
+```
+
+One can run the benchmarks with `make bench_board`.
+
+With PyPy3.7, I get:
+
+```
+bench board
+hostname: voyage
+{'cache_tag': 'pypy37',
+ 'version': sys.pypy_version_info(major=7, minor=3, micro=7, releaselevel='final', serial=0)}
+list                          : 3.21e-07 s (  0.9 * Julia)
+piconumpy.purepy              : 1.37e-05 s ( 36.9 * Julia)
+numpy                         : 1.18e-04 s (316.6 * Julia)
+piconumpy.hpy                 : 1.26e-05 s ( 33.8 * Julia)
+piconumpy.cpython_capi        : 5.52e-05 s (148.6 * Julia)
+```
+
+With CPython:
+
+```
+bench board
+hostname: voyage
+{'cache_tag': 'cpython-39',
+ 'version': sys.version_info(major=3, minor=9, micro=7, releaselevel='final', serial=0)}
+list                          : 5.16e-06 s ( 13.9 * Julia)
+piconumpy.purepy              : 8.04e-06 s ( 21.6 * Julia)
+numpy                         : 1.01e-05 s ( 27.1 * Julia)
+piconumpy.hpy                 : 5.90e-06 s ( 15.9 * Julia)
+piconumpy.cpython_capi        : 5.56e-06 s ( 15.0 * Julia)
+```
+
+With Python 3.8.5 (GraalVM CE Native 21.3.0)
+
+```
+bench board
+hostname: voyage
+{'cache_tag': 'graalpython-38',
+ 'version': sys.version_info(major=3, minor=8, micro=5, releaselevel='alpha', serial=0)}
+list                          : 1.15e-05 s ( 30.9 * Julia)
+piconumpy.purepy              : 1.74e-05 s ( 46.8 * Julia)
+numpy                         : ImportError numpy
+piconumpy.hpy                 : 4.91e-05 s (132.2 * Julia)
+piconumpy.cpython_capi        : 6.19e-05 s (166.7 * Julia)
+```
diff --git a/bench/microbench_low_level/result_sum_loop.md b/bench/microbench_low_level/result_sum_loop.md
index b415051..29d9b55 100644
--- a/bench/microbench_low_level/result_sum_loop.md
+++ b/bench/microbench_low_level/result_sum_loop.md
@@ -38,7 +38,6 @@ piconumpy.purepy              : 2.17e-03 s (164.1 * Julia)
 numpy                         : 1.09e-03 s ( 82.2 * Julia)
 piconumpy.hpy                 : 7.39e-04 s ( 55.8 * Julia)
 piconumpy.cpython_capi        : 5.07e-04 s ( 38.3 * Julia)
-
 ```
 
 With Python 3.8.5 (GraalVM CE Native 21.3.0)

From 34ad4c1301415b2d4212f3e21e7f41355281765f Mon Sep 17 00:00:00 2001
From: paugier <pierre.augier@univ-grenoble-alpes.fr>
Date: Mon, 13 Dec 2021 21:16:29 +0100
Subject: [PATCH 20/32] Converting floats to floats is expensive and useless

---
 piconumpy/purepy.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/piconumpy/purepy.py b/piconumpy/purepy.py
index a84ad31..2ebd85c 100644
--- a/piconumpy/purepy.py
+++ b/piconumpy/purepy.py
@@ -2,7 +2,7 @@ class array:
     __slots__ = ["data", "size"]
 
     def __init__(self, data):
-        self.data = list(float(number) for number in data)
+        self.data = list(data)
         self.size = len(self.data)
 
     def __add__(self, other):
@@ -35,4 +35,3 @@ def empty(size):
 
 def zeros(size):
     return array([0]*size)
-

From d172daa732d40f54962505b6965c1ad430ca349b Mon Sep 17 00:00:00 2001
From: paugier <pierre.augier@univ-grenoble-alpes.fr>
Date: Mon, 13 Dec 2021 21:23:01 +0100
Subject: [PATCH 21/32] Log and stability microbench

---
 bench/microbench_low_level/bench.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/bench/microbench_low_level/bench.py b/bench/microbench_low_level/bench.py
index 570d26a..6e71ba5 100644
--- a/bench/microbench_low_level/bench.py
+++ b/bench/microbench_low_level/bench.py
@@ -120,6 +120,8 @@ def board(X_0):
 
 size = 10000
 
+print(f"{method:30s}:", end="", flush=True)
+
 # warming during ~ 1s
 data_as_list = [random() for _ in range(size)]
 arr = array(data_as_list)
@@ -133,10 +135,10 @@ def median(sequence):
     return tmp[len(tmp) // 2]
 
 
-# measure during ~ 2s
+# measure during ~ 4s
 t0 = perf_counter()
 times = []
-while perf_counter() - t0 < 2.0:
+while perf_counter() - t0 < 4.0:
     data_as_list = [random() for _ in range(size)]
     arr = array(data_as_list)
     t_start = perf_counter()
@@ -144,4 +146,4 @@ def median(sequence):
     times.append(perf_counter() - t_start)
 
 time = median(times)
-print(f"{method:30s}: {time:.2e} s ({time / norm:5.1f} * Julia)")
+print(f" {time:.2e} s ({time / norm:5.1f} * Julia)")

From 8472fbdeb23d0f0d70d3d8bd4025461d0344b5ad Mon Sep 17 00:00:00 2001
From: paugier <pierre.augier@univ-grenoble-alpes.fr>
Date: Wed, 15 Dec 2021 12:06:15 +0100
Subject: [PATCH 22/32] Add microbench instantiate

---
 .gitignore                                    |  5 +-
 bench/microbench_low_level/Makefile           | 57 +++++++++----------
 bench/microbench_low_level/README.md          |  3 +
 bench/microbench_low_level/bench.py           | 13 ++++-
 .../{ => julia}/bench_board.jl                |  0
 .../{ => julia}/bench_cort.jl                 |  0
 .../{ => julia}/bench_init_zeros.jl           |  0
 .../julia/bench_instantiate.jl                | 22 +++++++
 .../{ => julia}/bench_sum_loop.jl             |  0
 .../{ => julia}/bench_sum_loop_index.jl       |  0
 .../microbench_low_level/result_initialize.md | 55 ++++++++++++++++++
 11 files changed, 121 insertions(+), 34 deletions(-)
 rename bench/microbench_low_level/{ => julia}/bench_board.jl (100%)
 rename bench/microbench_low_level/{ => julia}/bench_cort.jl (100%)
 rename bench/microbench_low_level/{ => julia}/bench_init_zeros.jl (100%)
 create mode 100644 bench/microbench_low_level/julia/bench_instantiate.jl
 rename bench/microbench_low_level/{ => julia}/bench_sum_loop.jl (100%)
 rename bench/microbench_low_level/{ => julia}/bench_sum_loop_index.jl (100%)
 create mode 100644 bench/microbench_low_level/result_initialize.md

diff --git a/.gitignore b/.gitignore
index 9a709bb..8e8015b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,5 +9,8 @@ build
 
 **/tmp*.*
 **/tmp*.*
+**/tmp/*
 
-*_cython.c
\ No newline at end of file
+*_cython.c
+
+piconumpy/_piconumpy_hpy.py
\ No newline at end of file
diff --git a/bench/microbench_low_level/Makefile b/bench/microbench_low_level/Makefile
index 15c96e3..1734cbe 100644
--- a/bench/microbench_low_level/Makefile
+++ b/bench/microbench_low_level/Makefile
@@ -1,51 +1,46 @@
 
-.PHONY : clean bench_sum_loop bench_sum_loop_index bench_cort bench_init_zeros
+IMPLEMENTATION=$(shell python -c 'import sys; print(sys.implementation.cache_tag)')
+
+.PHONY : clean bench_sum_loop bench_sum_loop_index bench_cort bench_init_zeros bench_instantiate
 
 bench_sum_loop: NAME_BENCH=sum_loop
-bench_sum_loop: tmp_julia_sum_loop.txt _bench
+bench_sum_loop: tmp/sum_loop_julia.txt _bench
 
 bench_sum_loop_index: NAME_BENCH=sum_loop_index
-bench_sum_loop_index: tmp_julia_sum_loop_index.txt _bench
+bench_sum_loop_index: tmp/sum_loop_index_julia.txt _bench
 
 bench_cort: NAME_BENCH=cort
-bench_cort: tmp_julia_cort.txt _bench
+bench_cort: tmp/cort_julia.txt _bench
 
 bench_init_zeros: NAME_BENCH=init_zeros
-bench_init_zeros: tmp_julia_init_zeros.txt _bench
+bench_init_zeros: tmp/init_zeros_julia.txt _bench
 
 bench_board: NAME_BENCH=board
-bench_board: tmp_julia_board.txt _bench
+bench_board: tmp/board_julia.txt _bench
+
+bench_instantiate: NAME_BENCH=instantiate
+bench_instantiate: tmp/instantiate_julia.txt _bench
+
 
 _bench:
 	@echo bench $(NAME_BENCH)
 	@python -c "from socket import gethostname as f; print('hostname:', f())"
 	@python -c "import sys; from pprint import pprint as p; p({key: sys.implementation.__dict__[key] for key in ('cache_tag', 'version')})"
-	@python bench.py list $(NAME_BENCH)
-	@python bench.py purepy $(NAME_BENCH)
-	@python bench.py numpy $(NAME_BENCH)
-	@python bench.py _piconumpy_hpy $(NAME_BENCH)
-	@python bench.py _piconumpy_cpython_capi $(NAME_BENCH)
-
-tmp_julia_sum_loop.txt: bench_sum_loop.jl
-	@julia bench_sum_loop.jl > tmp_julia_sum_loop.txt
-
-tmp_julia_sum_loop_index.txt: bench_sum_loop_index.jl
-	@julia bench_sum_loop_index.jl > tmp_julia_sum_loop_index.txt
-
-tmp_julia_cort.txt: bench_cort.jl
-	@julia bench_cort.jl > tmp_julia_cort.txt
-
-tmp_julia_init_zeros.txt: bench_init_zeros.jl
-	@julia bench_init_zeros.jl > tmp_julia_init_zeros.txt
-
-tmp_julia_board.txt: bench_board.jl
-	@julia bench_board.jl > tmp_julia_board.txt
+	@python bench.py list $(NAME_BENCH) | tee tmp/$(NAME_BENCH)_$(IMPLEMENTATION)_list.txt
+	@python bench.py purepy $(NAME_BENCH) | tee tmp/$(NAME_BENCH)_$(IMPLEMENTATION)_purepy.txt
+	@python bench.py numpy $(NAME_BENCH) | tee tmp/$(NAME_BENCH)_$(IMPLEMENTATION)_numpy.txt
+	@python bench.py _piconumpy_hpy $(NAME_BENCH) | tee tmp/$(NAME_BENCH)_$(IMPLEMENTATION)_hpy.txt
+	@python bench.py _piconumpy_cpython_capi $(NAME_BENCH) | tee tmp/$(NAME_BENCH)_$(IMPLEMENTATION)_cpy_api.txt
 
+tmp/%_julia.txt: julia/bench_%.jl
+	@mkdir -p tmp
+	@julia julia/bench_$*.jl > $@
 
 clean:
-	rm -f tmp_*.txt
+	rm -rf tmp
 
-produce_traces: tmp_julia_sum_loop.txt
-	PYPYLOG=jit-log-opt,jit-summary,jit-backend-counts:tmp_traces_list.txt pypy bench.py list
-	PYPYLOG=jit-log-opt,jit-summary,jit-backend-counts:tmp_traces_piconumpy_list.txt pypy bench.py purepy
-	PYPYLOG=jit-log-opt,jit-summary,jit-backend-counts:tmp_traces_piconumpy_hpy.txt pypy bench.py _piconumpy_hpy
+produce_traces: tmp/sum_loop_julia.txt
+	@mkdir -p tmp
+	PYPYLOG=jit-log-opt,jit-summary,jit-backend-counts:tmp/pypylog_list.txt pypy bench.py list
+	PYPYLOG=jit-log-opt,jit-summary,jit-backend-counts:tmp/pypylog_piconumpy_list.txt pypy bench.py purepy
+	PYPYLOG=jit-log-opt,jit-summary,jit-backend-counts:tmp/pypylog_piconumpy_hpy.txt pypy bench.py _piconumpy_hpy
diff --git a/bench/microbench_low_level/README.md b/bench/microbench_low_level/README.md
index e017263..f62e564 100644
--- a/bench/microbench_low_level/README.md
+++ b/bench/microbench_low_level/README.md
@@ -15,4 +15,7 @@ We measure the performance for functions containing low level Python code.
 - `board` (command `make bench_board`): few indexing, simple float computations
   with sin/cos and instantiation of a small array.
 
+- `instantiate` (command `make bench_instantiate`): dominated by the
+  instantiation/deletion of small arrays of 4 floats.
+
 The files result_*.txt contain few results.
diff --git a/bench/microbench_low_level/bench.py b/bench/microbench_low_level/bench.py
index 6e71ba5..14a4dea 100644
--- a/bench/microbench_low_level/bench.py
+++ b/bench/microbench_low_level/bench.py
@@ -42,12 +42,14 @@
     method = method.replace("_piconumpy_", "piconumpy.")
 
 
-tmp_result_julia = Path(f"tmp_julia_{name_bench}.txt")
+tmp_result_julia = Path(f"tmp/{name_bench}_julia.txt")
 if tmp_result_julia.exists():
     with open(tmp_result_julia) as file:
         norm = float(file.read())
 else:
-    print(f"{tmp_result_julia} does not exist. First execute with `make`")
+    raise RuntimeError(
+        f"{tmp_result_julia} does not exist. First execute with `make`"
+    )
 
 
 def sum_loop(arr):
@@ -115,6 +117,13 @@ def board(X_0):
     return array([u0, v0, dU, dV])
 
 
+def instantiate(arr):
+    x = arr[0]
+    result = array([x, 3 * x, 6 * x, 9 * x])
+    result[0] = 2 * result[1]
+    return result
+
+
 compute_from_arr = locals()[name_bench]
 
 
diff --git a/bench/microbench_low_level/bench_board.jl b/bench/microbench_low_level/julia/bench_board.jl
similarity index 100%
rename from bench/microbench_low_level/bench_board.jl
rename to bench/microbench_low_level/julia/bench_board.jl
diff --git a/bench/microbench_low_level/bench_cort.jl b/bench/microbench_low_level/julia/bench_cort.jl
similarity index 100%
rename from bench/microbench_low_level/bench_cort.jl
rename to bench/microbench_low_level/julia/bench_cort.jl
diff --git a/bench/microbench_low_level/bench_init_zeros.jl b/bench/microbench_low_level/julia/bench_init_zeros.jl
similarity index 100%
rename from bench/microbench_low_level/bench_init_zeros.jl
rename to bench/microbench_low_level/julia/bench_init_zeros.jl
diff --git a/bench/microbench_low_level/julia/bench_instantiate.jl b/bench/microbench_low_level/julia/bench_instantiate.jl
new file mode 100644
index 0000000..a71cb63
--- /dev/null
+++ b/bench/microbench_low_level/julia/bench_instantiate.jl
@@ -0,0 +1,22 @@
+using Statistics
+
+function instantiate(arr::Array)
+    x = arr[1]
+    result = [x, 3*x, 6*x, 9*x]
+    result[1] = 2 * result[2]
+    return result
+end
+
+compute_from_arr = instantiate
+
+size = 10000
+nb_runs = 200
+
+times = zeros(nb_runs)
+
+for irun in 1:nb_runs
+    arr = rand(size)
+    times[irun] = @elapsed compute_from_arr(arr)
+end
+
+println(median(times))
diff --git a/bench/microbench_low_level/bench_sum_loop.jl b/bench/microbench_low_level/julia/bench_sum_loop.jl
similarity index 100%
rename from bench/microbench_low_level/bench_sum_loop.jl
rename to bench/microbench_low_level/julia/bench_sum_loop.jl
diff --git a/bench/microbench_low_level/bench_sum_loop_index.jl b/bench/microbench_low_level/julia/bench_sum_loop_index.jl
similarity index 100%
rename from bench/microbench_low_level/bench_sum_loop_index.jl
rename to bench/microbench_low_level/julia/bench_sum_loop_index.jl
diff --git a/bench/microbench_low_level/result_initialize.md b/bench/microbench_low_level/result_initialize.md
new file mode 100644
index 0000000..883cea1
--- /dev/null
+++ b/bench/microbench_low_level/result_initialize.md
@@ -0,0 +1,55 @@
+# Microbenchmark instantiate
+
+We measure the performance for this function:
+
+```python
+def instantiate(arr):
+    x = arr[0]
+    result = array([x, 3 * x, 6 * x, 9 * x])
+    result[0] = 2 * result[1]
+    return result
+```
+
+One can run the benchmarks with `make bench_instantiate`.
+
+With PyPy3.7, I get:
+
+```
+bench instantiate
+hostname: meige8pcpa79
+{'cache_tag': 'pypy37',
+ 'version': sys.pypy_version_info(major=7, minor=3, micro=7, releaselevel='final', serial=0)}
+list                          : 1.13e-07 s (  0.9 * Julia)
+piconumpy.purepy              : 8.50e-08 s (  0.7 * Julia)
+numpy                         : ImportError numpy
+piconumpy.hpy                 : 1.69e-06 s ( 13.1 * Julia)
+piconumpy.cpython_capi        : 1.53e-05 s (118.3 * Julia)
+```
+
+With CPython:
+
+```
+bench instantiate
+hostname: meige8pcpa79
+{'cache_tag': 'cpython-39',
+ 'version': sys.version_info(major=3, minor=9, micro=7, releaselevel='final', serial=0)}
+list                          : 1.19e-06 s (  9.2 * Julia)
+piconumpy.purepy              : 2.59e-06 s ( 20.0 * Julia)
+numpy                         : 3.63e-06 s ( 28.1 * Julia)
+piconumpy.hpy                 : 1.84e-06 s ( 14.3 * Julia)
+piconumpy.cpython_capi        : 1.35e-06 s ( 10.5 * Julia)
+```
+
+With Python 3.8.5 (GraalVM CE Native 21.3.0):
+
+```
+bench instantiate
+hostname: meige8pcpa79
+{'cache_tag': 'graalpython-38',
+ 'version': sys.version_info(major=3, minor=8, micro=5, releaselevel='alpha', serial=0)}
+list                          : 4.16e-06 s ( 32.3 * Julia)
+piconumpy.purepy              : 4.15e-06 s ( 32.2 * Julia)
+numpy                         : ImportError numpy
+piconumpy.hpy                 : 7.32e-06 s ( 56.8 * Julia)
+piconumpy.cpython_capi        : 9.68e-06 s ( 75.0 * Julia)
+```

From 593fa754a3faf38be2f91885d8619919ac9a25b4 Mon Sep 17 00:00:00 2001
From: Pierre Augier <pierre.augier@univ-grenoble-alpes.fr>
Date: Sat, 10 May 2025 00:26:12 +0200
Subject: [PATCH 23/32] Modernize pyproject.toml and Makefile

---
 Makefile       | 14 +++++++++-----
 pyproject.toml | 14 +++++++++-----
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/Makefile b/Makefile
index 3ed0ded..ef7e508 100644
--- a/Makefile
+++ b/Makefile
@@ -4,11 +4,18 @@ PYTHON := python
 endif
 
 all:
-	make develop_universal
+	make editable_universal
 ifeq ($(PYTHON),python)
-	make build_ext
+	make editable
 endif
 
+editable:
+	$(PYTHON) -m pip install -e .
+
+editable_universal:
+	$(PYTHON) -m pip install -e . --config-settings="--global-option=--hpy-abi=universal"
+	rm -f piconumpy/_piconumpy_hpy.py
+
 develop:
 	$(PYTHON) setup.py develop
 
@@ -16,9 +23,6 @@ develop_universal:
 	$(PYTHON) setup.py --hpy-abi=universal develop
 	rm -f piconumpy/_piconumpy_hpy.py
 
-pip:
-	$(PYTHON) -m pip install -e .[dev]
-
 build_ext_universal:
 	$(PYTHON) setup.py --hpy-abi=universal build_ext -if
 
diff --git a/pyproject.toml b/pyproject.toml
index 3234fad..52670de 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,10 +5,12 @@ description = "An experiment about Numpy and pyhandle/hpy."
 authors = [
     {name = "Pierre Augier", email = "pierre.augier@univ-grenoble-alpes.fr"},
 ]
-license = {text = "BSD 3-Clause"}
+license = "BSD-3-Clause"
+license-files = ["LICENSE"]
 readme = "README.md"
 keywords = ["numpy", "hpy", "PyPy"]
 requires-python = ">=3.8"
+dependencies = ["hpy>=0.9.0; implementation_name == 'cpython'"]
 
 [project.urls]
 homepage = "https://github.com/paugier/piconumpy"
@@ -16,15 +18,17 @@ repository = "https://github.com/paugier/piconumpy"
 documentation = "https://github.com/paugier/piconumpy"
 
 [project.optional-dependencies]
-dev = ['transonic', 'numpy', 'pytest', 'pythran']
-full = ['black']
+test = ["pytest", "numpy"]
+bench = ['transonic', 'numpy', 'pythran']
+format = ['black']
+full = ["piconumpy[test,bench,format]"]
 
 [build-system]
 requires = [
-    "setuptools >= 35.0.2",
+    "setuptools>=35.0.2",
     "wheel",
     "cython",
-    "hpy >= 0.9.0"
+    "hpy>=0.9.0; implementation_name == 'cpython'"
 ]
 
 [tool.black]

From f0d956f04b29666e37a9d1647fc57085de3ec82a Mon Sep 17 00:00:00 2001
From: Pierre Augier <pierre.augier@univ-grenoble-alpes.fr>
Date: Sun, 11 May 2025 23:03:38 +0200
Subject: [PATCH 24/32] IS_CPY instead of IS_PYPY (for GraalPy)

---
 Makefile                  | 1 +
 bench/bench_cpy_vs_hpy.py | 6 +++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index ef7e508..6f5b63f 100644
--- a/Makefile
+++ b/Makefile
@@ -11,6 +11,7 @@ endif
 
 editable:
 	$(PYTHON) -m pip install -e .
+	rm -f piconumpy/_piconumpy_hpy.py
 
 editable_universal:
 	$(PYTHON) -m pip install -e . --config-settings="--global-option=--hpy-abi=universal"
diff --git a/bench/bench_cpy_vs_hpy.py b/bench/bench_cpy_vs_hpy.py
index 9e95fd3..adee1df 100644
--- a/bench/bench_cpy_vs_hpy.py
+++ b/bench/bench_cpy_vs_hpy.py
@@ -15,7 +15,7 @@ def my_randn(mod, n):
     return result
 
 
-IS_PYPY = hasattr(sys, "pypy_version_info")
+IS_CPY = sys.implementation.name == "cpython"
 
 
 def runge_kutta_step(mod, f, x0, dt, t=None):
@@ -130,7 +130,7 @@ def main():
     if norm:
         print(f" ({t_hpy_univ/norm:4.1f} * Julia)")
 
-    if not IS_PYPY:
+    if IS_CPY:
         import piconumpy._piconumpy_hpy as pnp_hpy
 
         t_hpy_cpy_abi = bench(pnp_hpy, N_SLEDS, N_TIME)
@@ -139,7 +139,7 @@ def main():
         if norm:
             print(f" ({t_hpy_cpy_abi/norm:4.1f} * Julia)")
 
-    if IS_PYPY:
+    if not IS_CPY:
         import piconumpy.purepy as pnp_with_list
 
         t_with_list = bench(pnp_with_list, N_SLEDS, N_TIME)

From 3523611a0b33a2e8abadaa0dea35fcc2f7f1fc4e Mon Sep 17 00:00:00 2001
From: Pierre Augier <pierre.augier@univ-grenoble-alpes.fr>
Date: Tue, 13 May 2025 07:43:19 +0200
Subject: [PATCH 25/32] microbench: element_wise

---
 bench/microbench_low_level/Makefile           |  2 ++
 bench/microbench_low_level/bench.py           | 19 ++++++++++++
 .../julia/bench_element_wise.jl               | 30 +++++++++++++++++++
 3 files changed, 51 insertions(+)
 create mode 100644 bench/microbench_low_level/julia/bench_element_wise.jl

diff --git a/bench/microbench_low_level/Makefile b/bench/microbench_low_level/Makefile
index 1734cbe..5f874dd 100644
--- a/bench/microbench_low_level/Makefile
+++ b/bench/microbench_low_level/Makefile
@@ -21,6 +21,8 @@ bench_board: tmp/board_julia.txt _bench
 bench_instantiate: NAME_BENCH=instantiate
 bench_instantiate: tmp/instantiate_julia.txt _bench
 
+bench_element_wise: NAME_BENCH=element_wise
+bench_element_wise: tmp/element_wise_julia.txt _bench
 
 _bench:
 	@echo bench $(NAME_BENCH)
diff --git a/bench/microbench_low_level/bench.py b/bench/microbench_low_level/bench.py
index 14a4dea..dcc56d1 100644
--- a/bench/microbench_low_level/bench.py
+++ b/bench/microbench_low_level/bench.py
@@ -22,6 +22,9 @@
     array = ext.array
 elif method == "list":
     array = list
+    if name_bench == "element_wise":
+        sys.exit(0)
+
 elif method == "numpy":
 
     try:
@@ -124,6 +127,22 @@ def instantiate(arr):
     return result
 
 
+def element_wise(arr):
+
+    dt = 0.1
+    x0 = arr
+
+    k1 = x0 * dt
+    k2 = (x0 + k1 / 2) * dt
+    k3 = (x0 + k2 / 2) * dt
+    k4 = (x0 + k3) * dt
+    # workaround for a pypy bug
+    # see https://foss.heptapod.net/pypy/pypy/-/issues/3509
+    # x_new = x0 + (k1 + 2 * k2 + 2 * k3 + k4) / 6
+    x_new = x0 + (k1 + k2 * 2 + k3 * 2 + k4) / 6
+    return x_new
+
+
 compute_from_arr = locals()[name_bench]
 
 
diff --git a/bench/microbench_low_level/julia/bench_element_wise.jl b/bench/microbench_low_level/julia/bench_element_wise.jl
new file mode 100644
index 0000000..b1e0bd6
--- /dev/null
+++ b/bench/microbench_low_level/julia/bench_element_wise.jl
@@ -0,0 +1,30 @@
+using Statistics
+
+function element_wise(arr::Array)
+
+    dt = 0.1
+    x0 = arr
+
+    k1 = x0 * dt
+    k2 = (x0 + k1 / 2) * dt
+    k3 = (x0 + k2 / 2) * dt
+    k4 = (x0 + k3) * dt
+    x_new = x0 + (k1 + 2 * k2 + 2 * k3 + k4) / 6
+
+    return x_new
+
+end
+
+compute_from_arr = element_wise
+
+size = 10000
+nb_runs = 200
+
+times = zeros(nb_runs)
+
+for irun in 1:nb_runs
+    arr = rand(size)
+    times[irun] = @elapsed compute_from_arr(arr)
+end
+
+println(median(times))
\ No newline at end of file

From ad9dfc1e3a2197ad56e7c79cf011ba92cb0c0abe Mon Sep 17 00:00:00 2001
From: Pierre Augier <pierre.augier@univ-grenoble-alpes.fr>
Date: Tue, 13 May 2025 07:43:56 +0200
Subject: [PATCH 26/32] Various compatibility improvements

---
 README.md                     |  2 +-
 bench/bench_array1d.py        | 21 +++++++++++++--------
 bench/make_bench_piconumpy.py |  7 ++++---
 pyproject.toml                |  3 ++-
 4 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 6b2a908..75a5e80 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ issue for the future of scientific Python (see [1], [2], [HPy]).
 [2]: https://morepypy.blogspot.com/2019/12/hpy-kick-off-sprint-report.html
 [HPy]: https://github.com/hpyproject/hpy
 
-[HPy] is a very ambitious and promissing project to design a new and better C
+[HPy] is a very ambitious and promising project to design a new and better C
 API for interacting with Python interpreters. It should allow people to write
 Python extensions efficient on different interpreters (CPython, PyPy, Jython,
 IronPython, GraalPython, RustPython, etc.).
diff --git a/bench/bench_array1d.py b/bench/bench_array1d.py
index a73a635..ba4426f 100644
--- a/bench/bench_array1d.py
+++ b/bench/bench_array1d.py
@@ -1,9 +1,14 @@
+import sys
+
 import numpy as np
 
 from numpy import array
 from math import pi, cos, sin
 
-from transonic import jit
+from transonic import jit, wait_for_all_extensions
+
+IS_CPY = sys.implementation.name == "cpython"
+IS_PYPY = sys.implementation.name == "pypy"
 
 # begin code functions (don't remove this line)
 
@@ -75,15 +80,15 @@ def bench(n_sleds, n_time):
 
 # end code functions (don't remove this line)
 
+if IS_CPY or IS_PYPY:
 
-bench_pythran = jit(bench)
-# Numba does not support this code...
-# bench_numba = jit(backend="numba")(bench)
-from transonic import wait_for_all_extensions
+    bench_pythran = jit(bench)
+    # Numba does not support this code...
+    # bench_numba = jit(backend="numba")(bench)
 
-# warmup (compilation of the Pythran extension)
-bench_pythran(1, 1)
-wait_for_all_extensions()
+    # warmup (compilation of the Pythran extension)
+    bench_pythran(1, 1)
+    wait_for_all_extensions()
 
 if __name__ == "__main__":
 
diff --git a/bench/make_bench_piconumpy.py b/bench/make_bench_piconumpy.py
index 4a76e9a..eb54d0e 100644
--- a/bench/make_bench_piconumpy.py
+++ b/bench/make_bench_piconumpy.py
@@ -49,7 +49,8 @@ def create_tmp_file(name_module):
 from math import pi, cos, sin
 from pprint import pprint
 
-IS_PYPY = hasattr(sys, 'pypy_version_info')
+IS_CPY = sys.implementation.name == "cpython"
+
 """
     + code_functions
     + """
@@ -63,7 +64,7 @@ def create_tmp_file(name_module):
 from tmp_purepy_array import bench as bench_piconumpy_purepy_array
 from tmp_cython import bench as bench_cython
 
-if not IS_PYPY:
+if IS_CPY:
     from tmp_hpy import bench as bench_hpy
 
 pprint({key: sys.implementation.__dict__[key] for key in ("cache_tag", "version")})
@@ -95,7 +96,7 @@ def timeit(name_func, name, total_duration=2):
     )
 
 timeit("bench", name="PicoNumpy (CPython C-API)")
-if not IS_PYPY:
+if IS_CPY:
     timeit("bench_hpy", name="PicoNumpy (HPy CPy ABI)")
 timeit("bench_hpy_universal", name="PicoNumpy (HPy Universal)")
 timeit("bench_pythran", name="Transonic-Pythran")
diff --git a/pyproject.toml b/pyproject.toml
index 52670de..32f4dd9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,8 +20,9 @@ documentation = "https://github.com/paugier/piconumpy"
 [project.optional-dependencies]
 test = ["pytest", "numpy"]
 bench = ['transonic', 'numpy', 'pythran']
+profile = ["gprof2dot"]
 format = ['black']
-full = ["piconumpy[test,bench,format]"]
+full = ["piconumpy[test,bench,profile,format]"]
 
 [build-system]
 requires = [

From 442fdb72bd748b35b252636497381a9785233c43 Mon Sep 17 00:00:00 2001
From: paugier <pierre.augier@univ-grenoble-alpes.fr>
Date: Tue, 13 May 2025 10:54:03 +0200
Subject: [PATCH 27/32] makefile: compat PyPy and GraalPy

---
 Makefile | 45 ++++++++++++++++++++++++++++++++++++---------
 1 file changed, 36 insertions(+), 9 deletions(-)

diff --git a/Makefile b/Makefile
index 6f5b63f..0b5cae2 100644
--- a/Makefile
+++ b/Makefile
@@ -1,38 +1,51 @@
 
 ifeq ($(PYTHON),)
-PYTHON := python
+PYTHON := python3
 endif
 
+IMPLEMENTATION := $(shell $(PYTHON) -c "import sys; print(sys.implementation.name)")
+
+
 all:
 	make editable_universal
-ifeq ($(PYTHON),python)
+ifeq ($(IMPLEMENTATION),cpython)
 	make editable
 endif
 
+
+rm_hpy_py:
+	rm -f piconumpy/_piconumpy_hpy.py
+
 editable:
 	$(PYTHON) -m pip install -e .
-	rm -f piconumpy/_piconumpy_hpy.py
+	make rm_hpy_py
 
 editable_universal:
 	$(PYTHON) -m pip install -e . --config-settings="--global-option=--hpy-abi=universal"
-	rm -f piconumpy/_piconumpy_hpy.py
+	make rm_hpy_py
+
+editable_full:
+	$(PYTHON) -m pip install -e .[full]
+	make rm_hpy_py
 
+
+# deprecated but let's keep them
 develop:
 	$(PYTHON) setup.py develop
+	make rm_hpy_py
 
 develop_universal:
 	$(PYTHON) setup.py --hpy-abi=universal develop
-	rm -f piconumpy/_piconumpy_hpy.py
+	make rm_hpy_py
 
 build_ext_universal:
 	$(PYTHON) setup.py --hpy-abi=universal build_ext -if
+	make rm_hpy_py
 
 build_ext:
 	$(PYTHON) setup.py build_ext -if
-	rm -f piconumpy/_piconumpy_hpy.py
+	make rm_hpy_py
 
-full:
-	$(PYTHON) -m pip install -e .[full]
 
 format:
 	black -l 82 setup.py piconumpy/*.py
@@ -46,4 +59,18 @@ clean:
 	rm -rf build dist piconumpy.egg-info
 
 black:
-	black -l 82 .
\ No newline at end of file
+	black -l 82 .
+
+
+install_pypy:
+	uv python install pypy
+
+install_graalpy:
+	uv python install graalpy
+
+create_venv_pypy:
+	$(shell uv python find pypy) -m venv .venv_pypy --upgrade-deps
+
+create_venv_graalpy:
+	# cannot use --upgrade-deps because pip is patched for GraalPy
+	$(shell uv python find graalpy) -m venv .venv_graalpy

From 89ff2149ac2cf2e343b51616243b72044a9d8336 Mon Sep 17 00:00:00 2001
From: paugier <pierre.augier@univ-grenoble-alpes.fr>
Date: Tue, 13 May 2025 17:11:39 +0200
Subject: [PATCH 28/32] Add .mdformat.toml

---
 .mdformat.toml | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 .mdformat.toml

diff --git a/.mdformat.toml b/.mdformat.toml
new file mode 100644
index 0000000..972483a
--- /dev/null
+++ b/.mdformat.toml
@@ -0,0 +1,3 @@
+wrap = 89
+number = true
+end_of_line = "lf"

From adfc2f44a4282f77738eed09dd1bea7050c4321a Mon Sep 17 00:00:00 2001
From: paugier <pierre.augier@univ-grenoble-alpes.fr>
Date: Tue, 13 May 2025 17:32:21 +0200
Subject: [PATCH 29/32] Improve README and bench

---
 Makefile                      |   3 +
 README.md                     | 164 ++++++++++++++++++++--------------
 bench/Makefile                |   2 +-
 bench/bench_cpy_vs_hpy.py     |   7 +-
 bench/make_bench_piconumpy.py |  17 ++--
 piconumpy/bench.py            |  15 ++--
 6 files changed, 124 insertions(+), 84 deletions(-)

diff --git a/Makefile b/Makefile
index 0b5cae2..d360199 100644
--- a/Makefile
+++ b/Makefile
@@ -68,6 +68,9 @@ install_pypy:
 install_graalpy:
 	uv python install graalpy
 
+create_venv_cpy:
+	$(PYTHON) -m venv .venv_cpy --upgrade-deps
+
 create_venv_pypy:
 	$(shell uv python find pypy) -m venv .venv_pypy --upgrade-deps
 
diff --git a/README.md b/README.md
index 75a5e80..57202b8 100644
--- a/README.md
+++ b/README.md
@@ -5,42 +5,34 @@
 **An experiment about Numpy and HPy**
 
 The C API of CPython is one of the cause of the success of Python in scientific
-computing. In particular, Numpy (and all the Python scientific stack) is built
-on top of this API. However, some characteristics of this API start to be an
-issue for the future of scientific Python (see [1], [2], [HPy]).
+computing. In particular, Numpy (and all the Python scientific stack) is built on top of
+this API. However, some characteristics of this API start to be an issue for the future
+of scientific Python (see [1], [2], [HPy]).
 
-[1]: https://faster-cpython.readthedocs.io/
-[2]: https://morepypy.blogspot.com/2019/12/hpy-kick-off-sprint-report.html
-[HPy]: https://github.com/hpyproject/hpy
-
-[HPy] is a very ambitious and promising project to design a new and better C
-API for interacting with Python interpreters. It should allow people to write
-Python extensions efficient on different interpreters (CPython, PyPy, Jython,
-IronPython, GraalPython, RustPython, etc.).
-
-PyPy would be especially useful for some scientific applications. For example
-for Integration and ODEs
-([scipy.integrate](https://docs.scipy.org/doc/scipy/reference/integrate.html)),
-for which there are a lot of callbacks of very small functions. This repository
-contains [a tiny benchmark](bench/without_numpy) showing that as long as Numpy
-is not used, PyPy is very efficient for such task. Unfortunately, as soon as
-Numpy is used, PyPy becomes very slow!
+[HPy] is a very ambitious and promising project to design a new and better C API for
+interacting with Python interpreters. It should allow people to write Python extensions
+efficient on different interpreters (CPython, PyPy, Jython, IronPython, GraalPython,
+RustPython, etc.).
 
-[bench/without_numpy]: https://github.com/paugier/piconumpy/blob/master/bench/without_numpy/
+PyPy would be especially useful for some scientific applications, for example
+integration and ODEs
+([scipy.integrate](https://docs.scipy.org/doc/scipy/reference/integrate.html)), for which
+there are a lot of callbacks of very small functions. This repository contains
+[a tiny benchmark](bench/without_numpy) showing that as long as Numpy is not used, PyPy
+is very efficient for such tasks. Unfortunately, as soon as Numpy is used, PyPy becomes
+very slow!
 
-With PicoNumpy, I'd like to study if [HPy] could help for codes using Numpy and
-callbacks of small Python functions.
+With PicoNumpy, I'd like to study if [HPy] could help for codes using Numpy and callbacks
+of small Python functions.
 
-We start by a [simple but realistic benchmark](bench/bench_array1d.py) (the
-slow loops only involve pure-Python and very simple Numpy). We then wrote a
-tiny ("pico") implementation of a Numpy like object (just sufficient to run the
-benchmark).
+We start with a [simple but realistic benchmark](bench/bench_array1d.py) (the slow loops
+only involve pure Python and very simple Numpy). We then wrote a tiny ("pico")
+implementation of a Numpy-like object (just sufficient to run the benchmark).
 
-The next task is to reimplement PicoNumpy using [HPy] and to check if PyPy
-could efficiently accelerate [our main benchmark](bench/bench_array1d.py).
+The next task is to reimplement PicoNumpy using [HPy] and to check if PyPy could
+efficiently accelerate [our main benchmark](bench/bench_array1d.py).
 
-PicoNumpy is really tiny. It just provides an `array` class (one-dimensional)
-supporting:
+PicoNumpy is really tiny. It just provides an `array` class (one-dimensional) supporting:
 
 - Instantiation from a list of floats
 - Elementwise multiplication and division by a float
@@ -48,29 +40,25 @@ supporting:
 - Indexing
 - `len`
 
-A good acceleration by PyPy of our example would be a great proof that the
-scientific Python community has to invest time and energy on [HPy].
+A good acceleration by PyPy of our example would be a great proof that the scientific
+Python community has to invest time and energy on [HPy].
 
-In the script [bench_array1d.py](bench/bench_array1d.py), Transonic is used for
-the benchmark and comparison. With Transonic-Pythran, we typically get a 50
-speedup compared to CPython (and ~400 versus PyPy, which is still very slow for
-such codes using Numpy).
-
-[bench/bench_array1d.py]: https://github.com/paugier/piconumpy/blob/master/bench/bench_array1d.py
+In the script [bench_array1d.py](bench/bench_array1d.py), Transonic is used for the
+benchmark and comparison. With Transonic-Pythran, we typically get a 50 speedup compared
+to CPython (and ~400 versus PyPy, which is still very slow for such codes using Numpy).
 
 ## Install and run the benchmarks
 
-**Warning:** PicoNumpy now depends on HPy, which still has to be installed from
-the [Git repository](https://github.com/hpyproject/hpy). For now, the
-installation is a bit more complex that what is described here (more about this
+**Warning:** PicoNumpy depends on HPy >=0.9.0. For now, the installation is a bit more
+complex that what is described here (more about this
 [here](#more-precise-notes-on-how-to-install-and-run-the-benchmarks-with-PyPy)).
 
-`make` should install the package in editable mode. `cd bench; make` should run
-the benchmarks. For the benchmarks, Julia is used for a good comparison point
-so the command `julia` has to be available.
+`make` should install the package in editable mode. `cd bench; make` should run the
+benchmarks. For the benchmarks, Julia is used for a good comparison point so the command
+`julia` has to be available.
 
-For PyPy, the Makefiles are sensible to the environment variable `PYTHON`, so
-you could do:
+For PyPy, the Makefiles are sensible to the environment variable `PYTHON`, so you could
+do:
 
 ```bash
 export PYTHON=pypy3
@@ -79,8 +67,8 @@ cd bench
 make
 ```
 
-The benchmark code can be profiled for the different implementations with the
-commands (you need gprof2dot and graphviz):
+The benchmark code can be profiled for the different implementations with the commands
+(you need gprof2dot and graphviz):
 
 ```bash
 cd bench
@@ -90,48 +78,82 @@ make profile METHOD="purepy"
 make profile METHOD="cython"
 ```
 
-### More precise notes on how to install and run the benchmarks with PyPy
+### Notes on how to install and run the benchmarks with PyPy
 
-Download and extract a nightly PyPy build
-<https://buildbot.pypy.org/nightly/>. Add to the `PATH` environment variable
-the path of the directory containing the `pypy` executable (something like
-`~/opt/pypy-c-jit-101190-b661dc329618-linux64/bin`). Then, you should be able
-to run:
+PyPy can be downloaded with UV or manually (for example from
+<https://buildbot.pypy.org/nightly/> for a nightly build).
 
-```bash
-pypy -m ensurepip
-pypy -m pip install pip -U
-pypy -m pip install numpy cython pytest transonic pythran
+With UV, one can run
+
+```sh
+uv python install pypy
 ```
 
-One can check which HPy version is vendored with PyPy:
+and then get the path towards `pypy` executable with:
 
-```bash
-pypy -c "import hpy.universal as u; print(u.get_version())"
+```sh
+uv python find pypy
 ```
 
-gives `('0.0.3', '2196f14')`.
+which can give something like
+`~/.local/share/uv/python/pypy-3.11.11-linux-x86_64-gnu/bin/pypy`.
 
-Now we can build-install PicoNumpy:
+Then, you should be able to create a virtual environment, activate it and build-install
+PicoNumpy with
 
 ```bash
-cd ~/Dev/piconumpy
-pypy setup.py --hpy-abi=universal develop
+cd ~/dev/piconumpy
+~/.local/share/uv/python/pypy-3.11.11-linux-x86_64-gnu/bin/pypy -m venv .venv_pypy --upgrade-deps
+. .venv_pypy/bin/activate
+pip install -e .[full]
 ```
 
-And run the benchmarks with:
+and run the benchmarks with:
 
 ```bash
-export PYTHON="pypy"
+cd bench
 make clean
 make bench_hpy
 make
 ```
 
+Note that one can check which HPy version is vendored with PyPy:
+
+```bash
+python -c "import hpy.universal as u; print(u.get_version())"
+```
+
+### Notes on how to install and run the benchmarks with GraalPy
+
+GraalPy can be downloaded with UV with
+
+```sh
+uv python install graalpy
+```
+
+Then, one can run
+
+```sh
+cd ~/dev/piconumpy
+# cannot use --upgrade-deps because pip is patched for GraalPy
+$(uv python find graalpy) -m venv .venv_graalpy
+. .venv_graalpy/bin/activate
+# we don't try to run the full benchmarks using Pythran on GraalPy
+pip install -e .[test,profile]
+```
+
+and run the benchmarks with:
+
+```bash
+cd bench
+make clean
+make bench_hpy
+```
+
 ## Few results
 
-As of today (12 October 2021), HPy is not yet ready for high performance, but at
-least (with HPy 0.0.3) it runs !
+As of today (12 October 2021), HPy is not yet ready for high performance, but at least
+(with HPy 0.0.3) it runs !
 
 ### At home (Intel(R) Core(TM) i5-8400 CPU @ 2.80GHz)
 
@@ -183,3 +205,7 @@ CPython C-API:   0.592 seconds (34.6 * Julia)
 HPy [Universal]: 0.207 seconds (12.1 * Julia)
 Python list:     0.093 seconds ( 5.4 * Julia)
 ```
+
+[1]: https://faster-cpython.readthedocs.io/
+[2]: https://morepypy.blogspot.com/2019/12/hpy-kick-off-sprint-report.html
+[hpy]: https://github.com/hpyproject/hpy
diff --git a/bench/Makefile b/bench/Makefile
index 7da6e64..59359f6 100644
--- a/bench/Makefile
+++ b/bench/Makefile
@@ -7,7 +7,7 @@ ifeq ($(METHOD),)
 METHOD := cpython-c-api
 endif
 
-all: tmp.py tmp_result_julia.txt
+bench_full: tmp.py tmp_result_julia.txt
 	$(PYTHON) tmp.py
 
 tmp.py: bench_array1d.py make_bench_piconumpy.py
diff --git a/bench/bench_cpy_vs_hpy.py b/bench/bench_cpy_vs_hpy.py
index adee1df..1bb35dd 100644
--- a/bench/bench_cpy_vs_hpy.py
+++ b/bench/bench_cpy_vs_hpy.py
@@ -1,9 +1,11 @@
-import sys
-from time import perf_counter
 import random
+import socket
+import sys
+
 from math import pi, cos, sin
 from pathlib import Path
 from pprint import pprint
+from time import perf_counter
 
 here = Path(__file__).absolute().parent
 
@@ -107,6 +109,7 @@ def main():
     import piconumpy._piconumpy_cpython_capi as pnp_capi
 
     pprint({key: sys.implementation.__dict__[key] for key in ("cache_tag", "version")})
+    print(f"hostname: {socket.gethostname()}")
 
     tmp_result_julia = Path("tmp_result_julia.txt")
     if tmp_result_julia.exists():
diff --git a/bench/make_bench_piconumpy.py b/bench/make_bench_piconumpy.py
index eb54d0e..4f92bcc 100644
--- a/bench/make_bench_piconumpy.py
+++ b/bench/make_bench_piconumpy.py
@@ -43,12 +43,17 @@ def create_tmp_file(name_module):
 
 code = (
     """
+import socket
 import sys
-import numpy as np
-from piconumpy import array
+
 from math import pi, cos, sin
+from pathlib import Path
 from pprint import pprint
 
+import numpy as np
+
+from piconumpy import array
+
 IS_CPY = sys.implementation.name == "cpython"
 
 """
@@ -68,10 +73,12 @@ def create_tmp_file(name_module):
     from tmp_hpy import bench as bench_hpy
 
 pprint({key: sys.implementation.__dict__[key] for key in ("cache_tag", "version")})
-
+print(f"hostname: {socket.gethostname()}")
 # get norm from Julia benchmark
-with open("tmp_result_julia.txt") as file:
-    norm = float(file.read())
+
+path_julia_result = Path("tmp_result_julia.txt")
+assert path_julia_result.exists()
+norm = float(path_julia_result.read_text())
 
 max_length_name = len("piconumpy (CPython C-API)") + 2
 
diff --git a/piconumpy/bench.py b/piconumpy/bench.py
index a704e5f..a277d4a 100644
--- a/piconumpy/bench.py
+++ b/piconumpy/bench.py
@@ -11,6 +11,13 @@ def timeit_verbose(
     print_time=False,
     max_length_name=33,
 ):
+    if name is None:
+        name = stmt.split("(")[0]
+
+    fmt_name = f"{{:{max_length_name}s}}"
+    name = fmt_name.format(name)
+    print(f"{name}:", end="", flush=True)
+
     result = timeit(
         stmt, setup=setup, total_duration=total_duration, globals=globals
     )
@@ -20,18 +27,12 @@ def timeit_verbose(
     else:
         norm_given = True
 
-    if name is None:
-        name = stmt.split("(")[0]
-
-    fmt_name = f"{{:{max_length_name}s}}"
-    name = fmt_name.format(name)
-
     if print_time:
         raw_time = f" = {result:7.3g} s"
     else:
         raw_time = ""
 
-    print(f"{name}: {result/norm:5.3g} * norm{raw_time}")
+    print(f"\r{name}: {result/norm:5.3g} * norm{raw_time}")
     if not norm_given and not print_time:
         print(f"norm = {norm:5.3g} s")
 

From 4d53bc593ff69b7a7a77ed7f73b04254f59b0b42 Mon Sep 17 00:00:00 2001
From: Pierre Augier <pierre.augier@univ-grenoble-alpes.fr>
Date: Wed, 14 May 2025 06:48:05 +0200
Subject: [PATCH 30/32] Fix microbench_low_level to understand bench results

---
 Makefile                                      |   2 +-
 README.md                                     | 161 ++++++++++++------
 bench/Makefile                                |   9 +-
 bench/microbench_low_level/bench.py           |  13 +-
 .../microbench_low_level/julia/bench_board.jl |   2 +-
 .../julia/bench_element_wise.jl               |   4 +-
 .../julia/bench_init_zeros.jl                 |   2 +-
 .../julia/bench_instantiate.jl                |   2 +-
 ...lt_initialize.md => result_instantiate.md} |   0
 9 files changed, 130 insertions(+), 65 deletions(-)
 rename bench/microbench_low_level/{result_initialize.md => result_instantiate.md} (100%)

diff --git a/Makefile b/Makefile
index d360199..92877da 100644
--- a/Makefile
+++ b/Makefile
@@ -51,7 +51,7 @@ format:
 	black -l 82 setup.py piconumpy/*.py
 	clang-format-7 -i piconumpy/*cpython_capi.c
 
-tests:
+tests: rm_hpy_py
 	$(PYTHON) -m pytest piconumpy -s
 
 clean:
diff --git a/README.md b/README.md
index 57202b8..f473ec5 100644
--- a/README.md
+++ b/README.md
@@ -35,8 +35,8 @@ efficiently accelerate [our main benchmark](bench/bench_array1d.py).
 PicoNumpy is really tiny. It just provides an `array` class (one-dimensional) supporting:
 
 - Instantiation from a list of floats
-- Elementwise multiplication and division by a float
-- Elementwise addition (of 2 arrays)
+- Element-wise multiplication and division by a float
+- Element-wise addition (of 2 arrays)
 - Indexing
 - `len`
 
@@ -44,31 +44,26 @@ A good acceleration by PyPy of our example would be a great proof that the scien
 Python community has to invest time and energy on [HPy].
 
 In the script [bench_array1d.py](bench/bench_array1d.py), Transonic is used for the
-benchmark and comparison. With Transonic-Pythran, we typically get a 50 speedup compared
+benchmark and comparison. With Transonic-Pythran, we typically get a ~50x speed-up compared
 to CPython (and ~400 versus PyPy, which is still very slow for such codes using Numpy).
 
 ## Install and run the benchmarks
 
-**Warning:** PicoNumpy depends on HPy >=0.9.0. For now, the installation is a bit more
-complex that what is described here (more about this
-[here](#more-precise-notes-on-how-to-install-and-run-the-benchmarks-with-PyPy)).
+`pip install -e .[full]` should build and install the package in editable mode, together
+with all the dependencies needed for testing, benchmarking and profiling.
 
-`make` should install the package in editable mode. `cd bench; make` should run the
-benchmarks. For the benchmarks, Julia is used for a good comparison point so the command
-`julia` has to be available.
+For the benchmarks, Julia is used as a reference point, so the command `julia` has
+to be available. The different benchmarks can be run with:
 
-For PyPy, the Makefiles are sensible to the environment variable `PYTHON`, so you could
-do:
-
-```bash
-export PYTHON=pypy3
-make
+```sh
 cd bench
-make
+make clean
+make bench_hpy
+make bench_full
 ```
 
-The benchmark code can be profiled for the different implementations with the commands
-(you need gprof2dot and graphviz):
+The benchmark code can be profiled for the different piconumpy implementations with the
+commands (you need gprof2dot and graphviz):
 
 ```bash
 cd bench
@@ -78,7 +73,7 @@ make profile METHOD="purepy"
 make profile METHOD="cython"
 ```
 
-### Notes on how to install and run the benchmarks with PyPy
+### Notes on PyPy
 
 PyPy can be downloaded with UV or manually (for example from
 <https://buildbot.pypy.org/nightly/> for a nightly build).
@@ -103,7 +98,7 @@ PicoNumpy with
 
 ```bash
 cd ~/dev/piconumpy
-~/.local/share/uv/python/pypy-3.11.11-linux-x86_64-gnu/bin/pypy -m venv .venv_pypy --upgrade-deps
+$(uv python find pypy) -m venv .venv_pypy --upgrade-deps
 . .venv_pypy/bin/activate
 pip install -e .[full]
 ```
@@ -114,7 +109,7 @@ and run the benchmarks with:
 cd bench
 make clean
 make bench_hpy
-make
+make bench_full
 ```
 
 Note that one can check which HPy version is vendored with PyPy:
@@ -123,7 +118,7 @@ Note that one can check which HPy version is vendored with PyPy:
 python -c "import hpy.universal as u; print(u.get_version())"
 ```
 
-### Notes on how to install and run the benchmarks with GraalPy
+### Notes on GraalPy
 
 GraalPy can be downloaded with UV with
 
@@ -152,58 +147,116 @@ make bench_hpy
 
 ## Few results
 
-As of today (12 October 2021), HPy is not yet ready for high performance, but at least
-(with HPy 0.0.3) it runs !
-
-### At home (Intel(R) Core(TM) i5-8400 CPU @ 2.80GHz)
+### Full benchmarks
 
 - With CPython
 
 ```
-Julia                      :     1 * norm = 0.0171 s
-PicoNumpy (CPython C-API)  :  11.1 * norm
-PicoNumpy (HPy CPy ABI)    :  11.6 * norm
-PicoNumpy (HPy Universal)  :  12.1 * norm
-Transonic-Pythran          : 0.537 * norm
-Numpy                      :  33.8 * norm
-PicoNumpy (purepy)         :  43.7 * norm
-PicoNumpy (purepy_array)   :  44.8 * norm
-PicoNumpy (Cython)         :  33.9 * norm
+{'cache_tag': 'cpython-311',
+ 'version': sys.version_info(major=3, minor=11, micro=2, releaselevel='final', serial=0)}
+hostname: meige7ltpa212
+Julia                      :     1 * norm = 0.0129 s
+PicoNumpy (CPython C-API)  :  6.55 * norm
+PicoNumpy (HPy CPy ABI)    :  7.46 * norm
+PicoNumpy (HPy Universal)  :  7.92 * norm
+Transonic-Pythran          : 0.581 * norm
+Numpy                      :  27.1 * norm
+PicoNumpy (purepy)         :  18.8 * norm
+PicoNumpy (purepy_array)   :  31.7 * norm
+PicoNumpy (Cython)         :  23.3 * norm
 ```
 
 - With PyPy3
 
 ```
-Julia                      :     1 * norm = 0.0171 s
-PicoNumpy (CPython C-API)  :  39.2 * norm
-PicoNumpy (HPy Universal)  :  13.1 * norm
-Transonic-Pythran          : 0.562 * norm
-Numpy                      :   286 * norm
-PicoNumpy (purepy)         :  5.59 * norm
-PicoNumpy (purepy_array)   :  7.41 * norm
-PicoNumpy (Cython)         :   282 * norm
+{'cache_tag': 'pypy311',
+ 'version': sys.pypy_version_info(major=7, minor=3, micro=19, releaselevel='final', serial=0)}
+hostname: meige7ltpa212
+Julia                      :     1 * norm = 0.0129 s
+PicoNumpy (CPython C-API)  :  35.5 * norm
+PicoNumpy (HPy Universal)  :  44.7 * norm
+Transonic-Pythran          : 0.609 * norm
+Numpy                      :   168 * norm
+PicoNumpy (purepy)         :  2.98 * norm
+PicoNumpy (purepy_array)   :   8.7 * norm
+PicoNumpy (Cython)         :   288 * norm
+```
+
+Discussion: PyPy with HPy universal is really too slow (44.7x slower than Julia, 6x slower than
+CPython with its C-API and even a bit slower than PyPy with cpyext!). This is a big issue
+for HPy!
+
+A reasonable target would be as fast as CPython with its C-API...
+
+Profiling shows that the issue is related to slow element-wise operations, as in the micro-benchmark:
+
+```sh
+cd microbench_low_level
+make bench_element_wise
+```
+
+- With CPython
+
+```sh
+bench element_wise
+hostname: meige7ltpa212
+{'cache_tag': 'cpython-311',
+ 'version': sys.version_info(major=3, minor=11, micro=2, releaselevel='final', serial=0)}
+piconumpy.purepy              : 7.88e-06 s ( 21.9 * Julia)
+numpy                         : 7.88e-06 s ( 21.9 * Julia)
+piconumpy.hpy (universal)     : 1.34e-06 s (  3.7 * Julia)
+piconumpy.cpython_capi        : 6.12e-07 s (  1.7 * Julia)
+```
+
+- With PyPy3
+
+```sh
+bench element_wise
+hostname: meige7ltpa212
+{'cache_tag': 'pypy311',
+ 'version': sys.pypy_version_info(major=7, minor=3, micro=19, releaselevel='final', serial=0)}
+piconumpy.purepy              : 1.46e-06 s (  4.1 * Julia)
+numpy                         : 4.39e-05 s (121.9 * Julia)
+piconumpy.hpy (universal)     : 4.27e-06 s ( 11.9 * Julia)
+piconumpy.cpython_capi        : 1.84e-06 s (  5.1 * Julia)
 ```
 
-#### Simpler benchmarks (bench/bench_cpy_vs_hpy.py)
+### Simpler benchmarks (bench/bench_cpy_vs_hpy.py)
 
 - With CPython
 
 ```
-{'cache_tag': 'cpython-39',
- 'version': sys.version_info(major=3, minor=9, micro=6, releaselevel='final', serial=0)}
-CPython C-API:   0.193 seconds (11.2 * Julia)
-HPy [Universal]: 0.208 seconds (12.1 * Julia)
-HPy [CPy ABI]:   0.201 seconds (11.7 * Julia)
+{'cache_tag': 'cpython-311',
+ 'version': sys.version_info(major=3, minor=11, micro=2, releaselevel='final', serial=0)}
+hostname: meige7ltpa212
+Julia:           0.013 seconds
+CPython C-API:   0.084 seconds ( 6.5 * Julia)
+HPy [Universal]: 0.102 seconds ( 7.9 * Julia)
+HPy [CPy ABI]:   0.096 seconds ( 7.4 * Julia)
 ```
 
 - With PyPy3
 
 ```
-{'cache_tag': 'pypy37',
- 'version': sys.pypy_version_info(major=7, minor=3, micro=6, releaselevel='final', serial=0)}
-CPython C-API:   0.592 seconds (34.6 * Julia)
-HPy [Universal]: 0.207 seconds (12.1 * Julia)
-Python list:     0.093 seconds ( 5.4 * Julia)
+{'cache_tag': 'pypy311',
+ 'version': sys.pypy_version_info(major=7, minor=3, micro=19, releaselevel='final', serial=0)}
+hostname: meige7ltpa212
+Julia:           0.013 seconds
+CPython C-API:   0.382 seconds (29.6 * Julia)
+HPy [Universal]: 0.487 seconds (37.6 * Julia)
+Python list:     0.037 seconds ( 2.9 * Julia)
+```
+
+- With GraalPy
+
+```
+{'cache_tag': 'graalpy242-311',
+ 'version': sys.version_info(major=3, minor=11, micro=7, releaselevel='final', serial=0)}
+hostname: meige7ltpa212
+Julia:           0.013 seconds
+CPython C-API:   2.123 seconds (164.2 * Julia)
+HPy [Universal]: 1.541 seconds (119.2 * Julia)
+Python list:     0.542 seconds (41.9 * Julia)
 ```
 
 [1]: https://faster-cpython.readthedocs.io/
diff --git a/bench/Makefile b/bench/Makefile
index 59359f6..eb4c4d4 100644
--- a/bench/Makefile
+++ b/bench/Makefile
@@ -7,7 +7,7 @@ ifeq ($(METHOD),)
 METHOD := cpython-c-api
 endif
 
-bench_full: tmp.py tmp_result_julia.txt
+bench_full: rm_hpy_py tmp.py tmp_result_julia.txt
 	$(PYTHON) tmp.py
 
 tmp.py: bench_array1d.py make_bench_piconumpy.py
@@ -20,11 +20,14 @@ clean:
 tmp_result_julia.txt:
 	julia bench.jl > tmp_result_julia.txt
 
-profile: tmp.py
+profile: rm_hpy_py tmp.py
 	$(PYTHON) profile_piconumpy.py $(METHOD)
 	# with gprof2dot and graphviz (command dot)
 	gprof2dot -f pstats tmp.pstats | dot -Tpng -o tmp_$(METHOD).png
 	eog tmp_$(METHOD).png
 
-bench_hpy:
+bench_hpy: rm_hpy_py
 	$(PYTHON) bench_cpy_vs_hpy.py
+
+rm_hpy_py:
+	rm -f ../piconumpy/_piconumpy_hpy.py
diff --git a/bench/microbench_low_level/bench.py b/bench/microbench_low_level/bench.py
index dcc56d1..d9d3a8a 100644
--- a/bench/microbench_low_level/bench.py
+++ b/bench/microbench_low_level/bench.py
@@ -14,6 +14,10 @@
 except IndexError:
     name_bench = "sum_loop"
 
+try:
+    size = int(sys.argv[3])
+except IndexError:
+    size = None
 
 if method == "_piconumpy_hpy":
     from piconumpy.util_hpy import import_ext
@@ -44,6 +48,8 @@
 if "_piconumpy_" in method:
     method = method.replace("_piconumpy_", "piconumpy.")
 
+if method.endswith("hpy"):
+    method += " (universal)"
 
 tmp_result_julia = Path(f"tmp/{name_bench}_julia.txt")
 if tmp_result_julia.exists():
@@ -145,8 +151,11 @@ def element_wise(arr):
 
 compute_from_arr = locals()[name_bench]
 
-
-size = 10000
+if size is None:
+    if name_bench.startswith("sum_loop") or name_bench == "cort":
+        size = 10000
+    else:
+        size = 4
 
 print(f"{method:30s}:", end="", flush=True)
 
diff --git a/bench/microbench_low_level/julia/bench_board.jl b/bench/microbench_low_level/julia/bench_board.jl
index 63187b1..69d8b64 100644
--- a/bench/microbench_low_level/julia/bench_board.jl
+++ b/bench/microbench_low_level/julia/bench_board.jl
@@ -31,7 +31,7 @@ end
 
 compute_from_arr = board
 
-size = 10000
+size = 4
 nb_runs = 200
 
 times = zeros(nb_runs)
diff --git a/bench/microbench_low_level/julia/bench_element_wise.jl b/bench/microbench_low_level/julia/bench_element_wise.jl
index b1e0bd6..107d3b8 100644
--- a/bench/microbench_low_level/julia/bench_element_wise.jl
+++ b/bench/microbench_low_level/julia/bench_element_wise.jl
@@ -17,8 +17,8 @@ end
 
 compute_from_arr = element_wise
 
-size = 10000
-nb_runs = 200
+size = 4
+nb_runs = 2000
 
 times = zeros(nb_runs)
 
diff --git a/bench/microbench_low_level/julia/bench_init_zeros.jl b/bench/microbench_low_level/julia/bench_init_zeros.jl
index b6035e5..4ac2656 100644
--- a/bench/microbench_low_level/julia/bench_init_zeros.jl
+++ b/bench/microbench_low_level/julia/bench_init_zeros.jl
@@ -8,7 +8,7 @@ end
 
 compute_from_arr = init_zeros
 
-size = 10000
+size = 4
 nb_runs = 200
 
 times = zeros(nb_runs)
diff --git a/bench/microbench_low_level/julia/bench_instantiate.jl b/bench/microbench_low_level/julia/bench_instantiate.jl
index a71cb63..5116e07 100644
--- a/bench/microbench_low_level/julia/bench_instantiate.jl
+++ b/bench/microbench_low_level/julia/bench_instantiate.jl
@@ -9,7 +9,7 @@ end
 
 compute_from_arr = instantiate
 
-size = 10000
+size = 4
 nb_runs = 200
 
 times = zeros(nb_runs)
diff --git a/bench/microbench_low_level/result_initialize.md b/bench/microbench_low_level/result_instantiate.md
similarity index 100%
rename from bench/microbench_low_level/result_initialize.md
rename to bench/microbench_low_level/result_instantiate.md

From b4a99aac9f0cf846f72343082b398ea88bf3e10f Mon Sep 17 00:00:00 2001
From: paugier <pierre.augier@univ-grenoble-alpes.fr>
Date: Wed, 14 May 2025 11:27:34 +0200
Subject: [PATCH 31/32] pythran 0.18 from GitHub

---
 pyproject.toml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 32f4dd9..ff6b793 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,7 +19,9 @@ documentation = "https://github.com/paugier/piconumpy"
 
 [project.optional-dependencies]
 test = ["pytest", "numpy"]
-bench = ['transonic', 'numpy', 'pythran']
+# pythran 0.18.0 needed but not yet on PyPI
+# (see https://github.com/serge-sans-paille/pythran/pull/2310#issuecomment-2871805768)
+bench = ['transonic', 'numpy', 'pythran@git+https://github.com/serge-sans-paille/pythran.git@0.18.0']
 profile = ["gprof2dot"]
 format = ['black']
 full = ["piconumpy[test,bench,profile,format]"]

From 010d2ad6d688f850fdabff93032b5796e00710d1 Mon Sep 17 00:00:00 2001
From: paugier <pierre.augier@univ-grenoble-alpes.fr>
Date: Wed, 14 May 2025 11:28:20 +0200
Subject: [PATCH 32/32] Update GitHub Actions ci

---
 .github/workflows/tests.yml | 35 ++++++++++++++---------------------
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index a824647..e24b666 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -8,39 +8,32 @@ jobs:
     strategy:
       max-parallel: 5
       matrix:
-        python-version: ['3.8', '3.9', '3.10', 'pypy-3.7-nightly']
+        python-version: ['3.11', '3.12', 'pypy-3.11']
 
     steps:
 
     - name: Setup Julia
-      uses: julia-actions/setup-julia@v1
+      uses: julia-actions/setup-julia@v2
 
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
 
-    - if: startsWith(matrix.python-version, 'pypy') != true
-      name: Install HPy (only for CPython)
-      run: |
-        # git clone -b release/0.0.3 --single-branch https://github.com/hpyproject/hpy
-        # cd hpy
-        # pip install .
-        pip install hpy>=0.9.0rc1
+    - name: Checkout
+      uses: actions/checkout@v4
 
-    - name: Install dependencies
+    - name: Build and install deps
       run: |
-        pip install numpy cython pytest transonic pythran
+        pip install -e .[full]
 
-    - name: Checkout
-      uses: actions/checkout@v3
-      with:
-        fetch-depth: 0
+    - if: startsWith(matrix.python-version, 'pypy') != true
+      name: Build universal extension (only needed for CPython)
+      run: |
+        pip install -e . --config-settings="--global-option=--hpy-abi=universal"
 
-    - name: build
+    - name: Remove _piconumpy_hpy.py
       run: |
-        python setup.py develop
-        python setup.py --hpy-abi=universal develop
         rm -f piconumpy/_piconumpy_hpy.py
 
     - name: Run tests
@@ -52,6 +45,6 @@ jobs:
         cd bench
         make tmp_result_julia.txt
         make bench_hpy
-        make
-        # let's rerun bench_hpy to get these results also at the end
+        make bench_full
+        # rerun bench_hpy to get these results also at the end
         make bench_hpy