From 6eb6adff2f1a2fe988e79c53e622b18f9ff7ab26 Mon Sep 17 00:00:00 2001
From: Xavier Dupre <xadupre@microsoft.com>
Date: Fri, 2 Feb 2024 17:22:54 +0100
Subject: [PATCH 01/13] update requirements

---
 azure-pipelines.yml  | 4 ++--
 requirements-dev.txt | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 907bb9f..61587f4 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -4,8 +4,8 @@ jobs:
     vmImage: 'ubuntu-latest'
   strategy:
     matrix:
-      Python311-Linux:
-        python.version: '3.11'
+      Python312-Linux:
+        python.version: '3.12'
     maxParallel: 3
 
   steps:
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 5804529..5e262e3 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -11,7 +11,7 @@ lightgbm
 matplotlib
 ml-dtypes
 git+https://github.com/onnx/onnxmltools.git
-onnxruntime>=1.16.1
+onnxruntime>=1.17.0
 openpyxl
 packaging
 pandas

From d520cfa8f996aaf5c7224efd00dd0cd76e9775f6 Mon Sep 17 00:00:00 2001
From: Xavier Dupre <xadupre@microsoft.com>
Date: Mon, 5 Feb 2024 13:03:25 +0100
Subject: [PATCH 02/13] Add class to yield results

---
 _doc/api/reference.rst                        | 10 ++
 .../ut_reference/test_evaluator_yield.py      | 77 +++++++++++++++
 onnx_array_api/reference/__init__.py          |  1 +
 onnx_array_api/reference/evaluator_yield.py   | 98 +++++++++++++++++++
 4 files changed, 186 insertions(+)
 create mode 100644 _unittests/ut_reference/test_evaluator_yield.py
 create mode 100644 onnx_array_api/reference/evaluator_yield.py

diff --git a/_doc/api/reference.rst b/_doc/api/reference.rst
index acbf90a..733a7fd 100644
--- a/_doc/api/reference.rst
+++ b/_doc/api/reference.rst
@@ -5,3 +5,13 @@ ExtendedReferenceEvaluator
 ++++++++++++++++++++++++++
 
 .. autoclass:: onnx_array_api.reference.ExtendedReferenceEvaluator
+    :members:
+
+YieldEvaluator
+++++++++++++++
+
+.. autoclass:: onnx_array_api.reference.ResultType
+    :members:
+
+.. autoclass:: onnx_array_api.reference.YieldEvaluator
+    :members:
diff --git a/_unittests/ut_reference/test_evaluator_yield.py b/_unittests/ut_reference/test_evaluator_yield.py
new file mode 100644
index 0000000..667ba01
--- /dev/null
+++ b/_unittests/ut_reference/test_evaluator_yield.py
@@ -0,0 +1,77 @@
+import unittest
+import numpy as np
+from onnx import TensorProto
+from onnx.helper import (
+    make_function,
+    make_graph,
+    make_model,
+    make_node,
+    make_opsetid,
+    make_tensor_value_info,
+)
+from onnx_array_api.ext_test_case import ExtTestCase
+from onnx_array_api.reference import YieldEvaluator, ResultType
+
+
+class TestArrayTensor(ExtTestCase):
+    def test_evaluator_yield(self):
+        new_domain = "custom_domain"
+        opset_imports = [make_opsetid("", 14), make_opsetid(new_domain, 1)]
+
+        node1 = make_node("MatMul", ["X", "A"], ["XA"])
+        node2 = make_node("Add", ["XA", "B"], ["Y"])
+
+        linear_regression = make_function(
+            new_domain,
+            "LinearRegression",
+            ["X", "A", "B"],
+            ["Y"],
+            [node1, node2],
+            opset_imports,
+            [],
+        )
+
+        X = make_tensor_value_info("X", TensorProto.FLOAT, [None, None])
+        A = make_tensor_value_info("A", TensorProto.FLOAT, [None, None])
+        B = make_tensor_value_info("B", TensorProto.FLOAT, [None, None])
+        Y = make_tensor_value_info("Y", TensorProto.FLOAT, None)
+
+        graph = make_graph(
+            [
+                make_node(
+                    "LinearRegression", ["X", "A", "B"], ["Y1"], domain=new_domain
+                ),
+                make_node("Abs", ["Y1"], ["Y"]),
+            ],
+            "example",
+            [X, A, B],
+            [Y],
+        )
+
+        onnx_model = make_model(
+            graph, opset_imports=opset_imports, functions=[linear_regression]
+        )
+
+        cst = np.arange(4).reshape((-1, 2)).astype(np.float32)
+        yield_eval = YieldEvaluator(onnx_model)
+        results = list(
+            yield_eval.enumerate_results(None, {"A": cst, "B": cst, "X": cst})
+        )
+        expected = [
+            (ResultType.INPUT, "A", np.array([[0.0, 1.0], [2.0, 3.0]], dtype=np.float32)),
+            (ResultType.INPUT, "B", np.array([[0.0, 1.0], [2.0, 3.0]], dtype=np.float32)),
+            (ResultType.INPUT, "X", np.array([[0.0, 1.0], [2.0, 3.0]], dtype=np.float32)),
+            (ResultType.RESULT, "Y1", np.array([[2.0, 4.0], [8.0, 14.0]], dtype=np.float32)),
+            (ResultType.RESULT, "Y", np.array([[2.0, 4.0], [8.0, 14.0]], dtype=np.float32)),
+            (ResultType.OUTPUT, "Y", np.array([[2.0, 4.0], [8.0, 14.0]], dtype=np.float32)),
+        ]
+        self.assertEqual(len(expected), len(results))
+        for a, b in zip(expected, results):
+            self.assertEqual(len(a), len(b))
+            self.assertEqual(a[0], b[0])
+            self.assertEqual(a[1], b[1])
+            self.assertEqual(a[2].tolist(), b[2].tolist())
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/onnx_array_api/reference/__init__.py b/onnx_array_api/reference/__init__.py
index d8c5aa5..8f956ab 100644
--- a/onnx_array_api/reference/__init__.py
+++ b/onnx_array_api/reference/__init__.py
@@ -11,6 +11,7 @@
 )
 from onnx.reference.op_run import to_array_extended
 from .evaluator import ExtendedReferenceEvaluator
+from .evaluator_yield import YieldEvaluator, ResultType
 
 
 def from_array_extended(tensor: np.array, name: Optional[str] = None) -> TensorProto:
diff --git a/onnx_array_api/reference/evaluator_yield.py b/onnx_array_api/reference/evaluator_yield.py
new file mode 100644
index 0000000..d10ccbc
--- /dev/null
+++ b/onnx_array_api/reference/evaluator_yield.py
@@ -0,0 +1,98 @@
+from typing import Any, Dict, List, Iterator, Optional, Tuple
+from enum import IntEnum
+from onnx import ModelProto
+from .evaluator import ExtendedReferenceEvaluator
+
+
+class ResultType(IntEnum):
+    RESULT = 1
+    INITIALIZER = 2
+    SPARSE_INITIALIZER = 4
+    INPUT = 8
+    OUTPUT = 16
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}.{self._name_}"
+
+
+class YieldEvaluator:
+    """
+    This class implements method `enumerate_results` which iterates on
+    intermediates results. By default, it uses
+    :class:`onnx_array_api.evaluator.ExtendedReferenceEvaluator`.
+
+    :param onnx_model: model to run
+    :param recursive: dig into subgraph and functions as well
+    """
+
+    def __init__(
+        self,
+        onnx_model: ModelProto,
+        recursive: bool = False,
+        cls=ExtendedReferenceEvaluator,
+    ):
+        assert not recursive, "recursive=True is not yet implemented"
+        self.onnx_model = onnx_model
+        self.evaluator = cls(onnx_model) if cls is not None else None
+
+    def enumerate_results(
+        self,
+        output_names: Optional[List[str]] = None,
+        feed_inputs: Optional[Dict[str, Any]] = None,
+    ) -> Iterator[Tuple[ResultType, str, Any]]:
+        """
+        Executes the onnx model.
+
+        Args:
+            output_names: requested outputs by names, None for all
+            feed_inputs: dictionary `{ input name: input value }`
+
+        Returns:
+            iterator on tuple(result kind, name, value)
+        """
+        assert isinstance(self.evaluator, ExtendedReferenceEvaluator), (
+            f"This implementation only works with "
+            f"ExtendedReferenceEvaluator not {type(self.evaluator)}"
+        )
+        attributes = {}
+        if output_names is None:
+            output_names = self.evaluator.output_names
+
+        results = {"": None}
+        results.update(self.evaluator.rt_inits_)
+        results.update(feed_inputs)
+        # step 0: initializer
+        for k, v in self.evaluator.rt_inits_.items():
+            yield ResultType.INITIALIZER, k, v
+        # step 1: inputs
+        for k, v in feed_inputs.items():
+            yield ResultType.INPUT, k, v
+
+        # step 2: execute nodes
+        for node in self.evaluator.rt_nodes_:
+            for i in node.input:
+                if i not in results:
+                    raise RuntimeError(
+                        f"Unable to find input {i!r} in known results {sorted(results)}, "
+                        f"self.rt_inits_ has {sorted(self.evaluator.rt_inits_)}, "
+                        f"feed_inputs has {sorted(feed_inputs)}."
+                    )
+            inputs = [results[i] for i in node.input]
+            linked_attributes = {}
+            if node.has_linked_attribute and attributes:
+                linked_attributes["linked_attributes"] = attributes
+            if node.need_context():
+                outputs = node.run(*inputs, context=results, **linked_attributes)
+            else:
+                outputs = node.run(*inputs, **linked_attributes)
+            for name, value in zip(node.output, outputs):
+                yield ResultType.RESULT, name, value
+                results[name] = value
+
+        # step 3: outputs
+        for name in output_names:
+            if name not in results:
+                raise RuntimeError(
+                    f"Unable to find output name {name!r} in {sorted(results)}, proto is\n{self.proto_}"
+                )
+            yield ResultType.OUTPUT, name, results[name]

From d616c58f60c38f382dfa73481310abba8c6ed8d4 Mon Sep 17 00:00:00 2001
From: Xavier Dupre <xadupre@microsoft.com>
Date: Mon, 5 Feb 2024 13:04:31 +0100
Subject: [PATCH 03/13] black

---
 .../ut_reference/test_evaluator_yield.py      | 36 +++++++++++++++----
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/_unittests/ut_reference/test_evaluator_yield.py b/_unittests/ut_reference/test_evaluator_yield.py
index 667ba01..952a782 100644
--- a/_unittests/ut_reference/test_evaluator_yield.py
+++ b/_unittests/ut_reference/test_evaluator_yield.py
@@ -58,12 +58,36 @@ def test_evaluator_yield(self):
             yield_eval.enumerate_results(None, {"A": cst, "B": cst, "X": cst})
         )
         expected = [
-            (ResultType.INPUT, "A", np.array([[0.0, 1.0], [2.0, 3.0]], dtype=np.float32)),
-            (ResultType.INPUT, "B", np.array([[0.0, 1.0], [2.0, 3.0]], dtype=np.float32)),
-            (ResultType.INPUT, "X", np.array([[0.0, 1.0], [2.0, 3.0]], dtype=np.float32)),
-            (ResultType.RESULT, "Y1", np.array([[2.0, 4.0], [8.0, 14.0]], dtype=np.float32)),
-            (ResultType.RESULT, "Y", np.array([[2.0, 4.0], [8.0, 14.0]], dtype=np.float32)),
-            (ResultType.OUTPUT, "Y", np.array([[2.0, 4.0], [8.0, 14.0]], dtype=np.float32)),
+            (
+                ResultType.INPUT,
+                "A",
+                np.array([[0.0, 1.0], [2.0, 3.0]], dtype=np.float32),
+            ),
+            (
+                ResultType.INPUT,
+                "B",
+                np.array([[0.0, 1.0], [2.0, 3.0]], dtype=np.float32),
+            ),
+            (
+                ResultType.INPUT,
+                "X",
+                np.array([[0.0, 1.0], [2.0, 3.0]], dtype=np.float32),
+            ),
+            (
+                ResultType.RESULT,
+                "Y1",
+                np.array([[2.0, 4.0], [8.0, 14.0]], dtype=np.float32),
+            ),
+            (
+                ResultType.RESULT,
+                "Y",
+                np.array([[2.0, 4.0], [8.0, 14.0]], dtype=np.float32),
+            ),
+            (
+                ResultType.OUTPUT,
+                "Y",
+                np.array([[2.0, 4.0], [8.0, 14.0]], dtype=np.float32),
+            ),
         ]
         self.assertEqual(len(expected), len(results))
         for a, b in zip(expected, results):

From fe32241f9d4b079a701a52ddc19d8621b0175f57 Mon Sep 17 00:00:00 2001
From: Xavier Dupre <xadupre@microsoft.com>
Date: Mon, 5 Feb 2024 13:19:31 +0100
Subject: [PATCH 04/13] add sumarry

---
 .../ut_reference/test_evaluator_yield.py      | 73 +++++++++++++++++++
 onnx_array_api/reference/evaluator_yield.py   | 59 +++++++++++++--
 2 files changed, 126 insertions(+), 6 deletions(-)

diff --git a/_unittests/ut_reference/test_evaluator_yield.py b/_unittests/ut_reference/test_evaluator_yield.py
index 952a782..daf9a93 100644
--- a/_unittests/ut_reference/test_evaluator_yield.py
+++ b/_unittests/ut_reference/test_evaluator_yield.py
@@ -62,31 +62,37 @@ def test_evaluator_yield(self):
                 ResultType.INPUT,
                 "A",
                 np.array([[0.0, 1.0], [2.0, 3.0]], dtype=np.float32),
+                None,
             ),
             (
                 ResultType.INPUT,
                 "B",
                 np.array([[0.0, 1.0], [2.0, 3.0]], dtype=np.float32),
+                None,
             ),
             (
                 ResultType.INPUT,
                 "X",
                 np.array([[0.0, 1.0], [2.0, 3.0]], dtype=np.float32),
+                None,
             ),
             (
                 ResultType.RESULT,
                 "Y1",
                 np.array([[2.0, 4.0], [8.0, 14.0]], dtype=np.float32),
+                "LinearRegression",
             ),
             (
                 ResultType.RESULT,
                 "Y",
                 np.array([[2.0, 4.0], [8.0, 14.0]], dtype=np.float32),
+                "Abs",
             ),
             (
                 ResultType.OUTPUT,
                 "Y",
                 np.array([[2.0, 4.0], [8.0, 14.0]], dtype=np.float32),
+                None,
             ),
         ]
         self.assertEqual(len(expected), len(results))
@@ -95,6 +101,73 @@ def test_evaluator_yield(self):
             self.assertEqual(a[0], b[0])
             self.assertEqual(a[1], b[1])
             self.assertEqual(a[2].tolist(), b[2].tolist())
+            self.assertEqual(a[3], b[3])
+
+    def test_evaluator_yield_summary(self):
+        new_domain = "custom_domain"
+        opset_imports = [make_opsetid("", 14), make_opsetid(new_domain, 1)]
+
+        node1 = make_node("MatMul", ["X", "A"], ["XA"])
+        node2 = make_node("Add", ["XA", "B"], ["Y"])
+
+        linear_regression = make_function(
+            new_domain,
+            "LinearRegression",
+            ["X", "A", "B"],
+            ["Y"],
+            [node1, node2],
+            opset_imports,
+            [],
+        )
+
+        X = make_tensor_value_info("X", TensorProto.FLOAT, [None, None])
+        A = make_tensor_value_info("A", TensorProto.FLOAT, [None, None])
+        B = make_tensor_value_info("B", TensorProto.FLOAT, [None, None])
+        Y = make_tensor_value_info("Y", TensorProto.FLOAT, None)
+
+        graph = make_graph(
+            [
+                make_node(
+                    "LinearRegression", ["X", "A", "B"], ["Y1"], domain=new_domain
+                ),
+                make_node("Abs", ["Y1"], ["Y"]),
+            ],
+            "example",
+            [X, A, B],
+            [Y],
+        )
+
+        onnx_model = make_model(
+            graph, opset_imports=opset_imports, functions=[linear_regression]
+        )
+
+        cst = np.arange(4).reshape((-1, 2)).astype(np.float32)
+        yield_eval = YieldEvaluator(onnx_model)
+        results = list(
+            yield_eval.enumerate_summarized(None, {"A": cst, "B": cst, "X": cst})
+        )
+        expected = [
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None),
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None),
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None),
+            (
+                ResultType.RESULT,
+                np.dtype("float32"),
+                (2, 2),
+                "CEIO",
+                "LinearRegression",
+            ),
+            (ResultType.RESULT, np.dtype("float32"), (2, 2), "CEIO", "Abs"),
+            (ResultType.OUTPUT, np.dtype("float32"), (2, 2), "CEIO", None),
+        ]
+        self.assertEqual(len(expected), len(results))
+        for a, b in zip(expected, results):
+            self.assertEqual(len(a), len(b))
+            self.assertEqual(a[0], b[0])
+            self.assertEqual(a[1], b[1])
+            self.assertEqual(a[2], b[2])
+            self.assertEqual(a[3], b[3])
+            self.assertEqual(a[4], b[4])
 
 
 if __name__ == "__main__":
diff --git a/onnx_array_api/reference/evaluator_yield.py b/onnx_array_api/reference/evaluator_yield.py
index d10ccbc..865bd00 100644
--- a/onnx_array_api/reference/evaluator_yield.py
+++ b/onnx_array_api/reference/evaluator_yield.py
@@ -1,5 +1,6 @@
 from typing import Any, Dict, List, Iterator, Optional, Tuple
 from enum import IntEnum
+import numpy as np
 from onnx import ModelProto
 from .evaluator import ExtendedReferenceEvaluator
 
@@ -41,14 +42,14 @@ def enumerate_results(
         feed_inputs: Optional[Dict[str, Any]] = None,
     ) -> Iterator[Tuple[ResultType, str, Any]]:
         """
-        Executes the onnx model.
+        Executes the onnx model and enumerate all the intermediate results.
 
         Args:
             output_names: requested outputs by names, None for all
             feed_inputs: dictionary `{ input name: input value }`
 
         Returns:
-            iterator on tuple(result kind, name, value)
+            iterator on tuple(result kind, name, value, node.op_type or None)
         """
         assert isinstance(self.evaluator, ExtendedReferenceEvaluator), (
             f"This implementation only works with "
@@ -63,10 +64,10 @@ def enumerate_results(
         results.update(feed_inputs)
         # step 0: initializer
         for k, v in self.evaluator.rt_inits_.items():
-            yield ResultType.INITIALIZER, k, v
+            yield ResultType.INITIALIZER, k, v, None
         # step 1: inputs
         for k, v in feed_inputs.items():
-            yield ResultType.INPUT, k, v
+            yield ResultType.INPUT, k, v, None
 
         # step 2: execute nodes
         for node in self.evaluator.rt_nodes_:
@@ -86,7 +87,7 @@ def enumerate_results(
             else:
                 outputs = node.run(*inputs, **linked_attributes)
             for name, value in zip(node.output, outputs):
-                yield ResultType.RESULT, name, value
+                yield ResultType.RESULT, name, value, node.op_type
                 results[name] = value
 
         # step 3: outputs
@@ -95,4 +96,50 @@ def enumerate_results(
                 raise RuntimeError(
                     f"Unable to find output name {name!r} in {sorted(results)}, proto is\n{self.proto_}"
                 )
-            yield ResultType.OUTPUT, name, results[name]
+            yield ResultType.OUTPUT, name, results[name], None
+
+    def enumerate_summarized(
+        self,
+        output_names: Optional[List[str]] = None,
+        feed_inputs: Optional[Dict[str, Any]] = None,
+    ) -> Iterator[Tuple[ResultType, str, Any]]:
+        """
+        Executes the onnx model and enumerate intermediate results without their names.
+
+        Args:
+            output_names: requested outputs by names, None for all
+            feed_inputs: dictionary `{ input name: input value }`
+
+        Returns:
+            iterator on tuple(result kind, node.type, dtype, shape, value)
+        """
+        for kind, name, value, op_type in self.enumerate_results(
+            output_names, feed_inputs
+        ):
+            summary = self.make_summary(value)
+            yield kind, value.dtype, value.shape, summary, op_type
+
+    def make_summary(self, value: Any, length: int = 4, modulo: int = 26) -> str:
+        """
+        Create a short string summarizing the value (discretization).
+
+        :param value: array
+        :param length: number of value to produce
+        :param module: discretization parameter
+        :return: short string
+        """
+        value4 = np.zeros(4, dtype=np.float64)
+        if value.size <= length:
+            value4[: value.size] = value.flatten().astype(np.float64)
+        else:
+            if value.size % length != 2:
+                value2 = np.zeros(
+                    value.size + length - value.size % length, dtype=np.float64
+                )
+                value2[: value.size] = value.flatten().astype(np.float64)
+            else:
+                value2 = value.flatten().astype(np.float64)
+            value4 = value2.reshape((4, -1)).mean(axis=1)
+        value4i = value4.astype(np.int64) % modulo
+        s = "".join([chr(65 + i) for i in value4i])
+        return s

From a8c2932e1c65a392fe127b168fa9e3cba27af9c1 Mon Sep 17 00:00:00 2001
From: Xavier Dupre <xadupre@microsoft.com>
Date: Mon, 5 Feb 2024 14:40:32 +0100
Subject: [PATCH 05/13] add distance

---
 CHANGELOGS.rst                                |   1 +
 _doc/api/reference.rst                        |   7 +
 .../ut_reference/test_evaluator_yield.py      | 198 +++++++++++++++++-
 onnx_array_api/reference/__init__.py          |   2 +-
 onnx_array_api/reference/evaluator_yield.py   | 123 ++++++++++-
 5 files changed, 320 insertions(+), 11 deletions(-)

diff --git a/CHANGELOGS.rst b/CHANGELOGS.rst
index dad0930..d0b6445 100644
--- a/CHANGELOGS.rst
+++ b/CHANGELOGS.rst
@@ -4,6 +4,7 @@ Change Logs
 0.2.0
 +++++
 
+* :pr:`71`: adds tools to compare two onnx graphs
 * :pr:`61`: adds function to plot onnx model as graphs
 * :pr:`60`: supports translation of local functions
 * :pr:`59`: add methods to update nodes in GraphAPI 
diff --git a/_doc/api/reference.rst b/_doc/api/reference.rst
index 733a7fd..a1507c8 100644
--- a/_doc/api/reference.rst
+++ b/_doc/api/reference.rst
@@ -15,3 +15,10 @@ YieldEvaluator
 
 .. autoclass:: onnx_array_api.reference.YieldEvaluator
     :members:
+
+DistanceExecution
++++++++++++++++++
+
+.. autoclass:: onnx_array_api.reference.DistanceExecution
+    :members:
+
diff --git a/_unittests/ut_reference/test_evaluator_yield.py b/_unittests/ut_reference/test_evaluator_yield.py
index daf9a93..60da2ee 100644
--- a/_unittests/ut_reference/test_evaluator_yield.py
+++ b/_unittests/ut_reference/test_evaluator_yield.py
@@ -10,7 +10,7 @@
     make_tensor_value_info,
 )
 from onnx_array_api.ext_test_case import ExtTestCase
-from onnx_array_api.reference import YieldEvaluator, ResultType
+from onnx_array_api.reference import YieldEvaluator, ResultType, DistanceExecution
 
 
 class TestArrayTensor(ExtTestCase):
@@ -147,18 +147,19 @@ def test_evaluator_yield_summary(self):
             yield_eval.enumerate_summarized(None, {"A": cst, "B": cst, "X": cst})
         )
         expected = [
-            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None),
-            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None),
-            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None),
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "A"),
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "B"),
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "X"),
             (
                 ResultType.RESULT,
                 np.dtype("float32"),
                 (2, 2),
                 "CEIO",
                 "LinearRegression",
+                "Y1",
             ),
-            (ResultType.RESULT, np.dtype("float32"), (2, 2), "CEIO", "Abs"),
-            (ResultType.OUTPUT, np.dtype("float32"), (2, 2), "CEIO", None),
+            (ResultType.RESULT, np.dtype("float32"), (2, 2), "CEIO", "Abs", "Y"),
+            (ResultType.OUTPUT, np.dtype("float32"), (2, 2), "CEIO", None, "Y"),
         ]
         self.assertEqual(len(expected), len(results))
         for a, b in zip(expected, results):
@@ -168,6 +169,191 @@ def test_evaluator_yield_summary(self):
             self.assertEqual(a[2], b[2])
             self.assertEqual(a[3], b[3])
             self.assertEqual(a[4], b[4])
+            self.assertEqual(a[5], b[5])
+
+    def test_distance_pair(self):
+        el1 = (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None)
+        el2 = el1
+        dc = DistanceExecution()
+        self.assertEqual(dc.distance_pair(el1, el2), 0)
+        el2 = (ResultType.INPUT, np.dtype("float16"), (2, 2), "ABCD", None)
+        self.assertEqual(dc.distance_pair(el1, el2), 2)
+        el2 = (ResultType.OUTPUT, np.dtype("float16"), (2, 2, 4), "GBCD", "Abs")
+        self.assertEqual(dc.distance_pair(el1, el2), 1130)
+        el2 = (ResultType.OUTPUT, np.dtype("float16"), (2, 3), "GBCD", "Abs")
+        self.assertEqual(dc.distance_pair(el1, el2), 1021)
+
+    def test_distance_sequence_0(self):
+        expected = [
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "A"),
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "B"),
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "X"),
+            (
+                ResultType.RESULT,
+                np.dtype("float32"),
+                (2, 2),
+                "CEIO",
+                "LinearRegression",
+                "Y1",
+            ),
+            (ResultType.RESULT, np.dtype("float32"), (2, 2), "CEIO", "Abs", "Y"),
+            (ResultType.OUTPUT, np.dtype("float32"), (2, 2), "CEIO", None, "Y"),
+        ]
+
+        dc = DistanceExecution()
+        d, align = dc.distance_sequence(expected, expected)
+        self.assertEqual(d, 0)
+        self.assertEqual(align, [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)])
+
+    def test_distance_sequence_ins(self):
+        s1 = [
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "A"),
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "B"),
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "X"),
+            (
+                ResultType.RESULT,
+                np.dtype("float32"),
+                (2, 2),
+                "CEIO",
+                "LinearRegression",
+                "Y1",
+            ),
+            (ResultType.RESULT, np.dtype("float32"), (2, 2), "CEIO", "Abs", "Y"),
+            (ResultType.OUTPUT, np.dtype("float32"), (2, 2), "CEIO", None, "Y"),
+        ]
+        s2 = [
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "A"),
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "B"),
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "X"),
+            (
+                ResultType.RESULT,
+                np.dtype("float32"),
+                (2, 2),
+                "CEIO",
+                "LinearRegression",
+                "Y1",
+            ),
+            (ResultType.OUTPUT, np.dtype("float32"), (2, 2), "CEIO", None, "Y"),
+        ]
+
+        dc = DistanceExecution()
+        d, align = dc.distance_sequence(s1, s2)
+        self.assertEqual(d, dc.insert_cost)
+        self.assertEqual(align, [(0, 0), (1, 1), (2, 2), (3, 3), (4, 3)])
+        d, align = dc.distance_sequence(s2, s1)
+        self.assertEqual(d, dc.insert_cost)
+        self.assertEqual(align, [(0, 0), (1, 1), (2, 2), (3, 3), (3, 4)])
+
+    def test_distance_sequence_equal(self):
+        s1 = [
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "A"),
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "B"),
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "X"),
+            (
+                ResultType.RESULT,
+                np.dtype("float32"),
+                (2, 2),
+                "CEIO",
+                "LinearRegression",
+                "Y1",
+            ),
+            (ResultType.RESULT, np.dtype("float32"), (2, 2), "CEIO", "Abs", "Y"),
+            (ResultType.OUTPUT, np.dtype("float32"), (2, 2), "CEIO", None, "Y"),
+        ]
+        s2 = [
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "A"),
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "B"),
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "X"),
+            (
+                ResultType.RESULT,
+                np.dtype("float32"),
+                (2, 2),
+                "CEIO",
+                "LinearRegression",
+                "Y1",
+            ),
+            (ResultType.RESULT, np.dtype("float32"), (2, 2), "CEIO", "Abs", "Z"),
+            (ResultType.OUTPUT, np.dtype("float32"), (2, 2), "CEIO", None, "Y"),
+        ]
+
+        dc = DistanceExecution()
+        d, align = dc.distance_sequence(s1, s2)
+        self.assertEqual(d, 0)
+        self.assertEqual(align, [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)])
+
+    def test_distance_sequence_diff(self):
+        s1 = [
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "A"),
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "B"),
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "X"),
+            (
+                ResultType.RESULT,
+                np.dtype("float32"),
+                (2, 2),
+                "CEIO",
+                "LinearRegression",
+                "Y1",
+            ),
+            (ResultType.RESULT, np.dtype("float32"), (2, 2), "CEIO", "Abs", "Y"),
+            (ResultType.OUTPUT, np.dtype("float32"), (2, 2), "CEIO", None, "Y"),
+        ]
+        s2 = [
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "A"),
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "B"),
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "X"),
+            (
+                ResultType.RESULT,
+                np.dtype("float32"),
+                (2, 2),
+                "CEIO",
+                "LinearRegression",
+                "Y1",
+            ),
+            (ResultType.RESULT, np.dtype("float32"), (2, 2), "CEIP", "Abs", "Z"),
+            (ResultType.OUTPUT, np.dtype("float32"), (2, 2), "CEIO", None, "Y"),
+        ]
+
+        dc = DistanceExecution()
+        d, align = dc.distance_sequence(s1, s2)
+        self.assertEqual(d, 1)
+        self.assertEqual(align, [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)])
+
+    def test_distance_sequence_diff2(self):
+        s1 = [
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "A"),
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "B"),
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "X"),
+            (
+                ResultType.RESULT,
+                np.dtype("float32"),
+                (2, 2),
+                "CEIO",
+                "LinearRegression",
+                "Y1",
+            ),
+            (ResultType.RESULT, np.dtype("float32"), (2, 2), "CEIO", "Abs", "Y"),
+            (ResultType.OUTPUT, np.dtype("float32"), (2, 2), "CEIO", None, "Y"),
+        ]
+        s2 = [
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "A"),
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "B"),
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "X"),
+            (
+                ResultType.RESULT,
+                np.dtype("float32"),
+                (2, 2),
+                "CEIO",
+                "LinearRegression",
+                "Y1",
+            ),
+            (ResultType.RESULT, np.dtype("float32"), (2, 3), "CEIP", "Abs", "Z"),
+            (ResultType.OUTPUT, np.dtype("float32"), (2, 2), "CEIP", None, "Y"),
+        ]
+
+        dc = DistanceExecution()
+        d, align = dc.distance_sequence(s1, s2)
+        self.assertEqual(d, 5)
+        self.assertEqual(align, [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)])
 
 
 if __name__ == "__main__":
diff --git a/onnx_array_api/reference/__init__.py b/onnx_array_api/reference/__init__.py
index 8f956ab..6f6fc58 100644
--- a/onnx_array_api/reference/__init__.py
+++ b/onnx_array_api/reference/__init__.py
@@ -11,7 +11,7 @@
 )
 from onnx.reference.op_run import to_array_extended
 from .evaluator import ExtendedReferenceEvaluator
-from .evaluator_yield import YieldEvaluator, ResultType
+from .evaluator_yield import DistanceExecution, ResultType, YieldEvaluator
 
 
 def from_array_extended(tensor: np.array, name: Optional[str] = None) -> TensorProto:
diff --git a/onnx_array_api/reference/evaluator_yield.py b/onnx_array_api/reference/evaluator_yield.py
index 865bd00..d347f39 100644
--- a/onnx_array_api/reference/evaluator_yield.py
+++ b/onnx_array_api/reference/evaluator_yield.py
@@ -16,11 +16,14 @@ def __repr__(self):
         return f"{self.__class__.__name__}.{self._name_}"
 
 
+RESULT_TYPE = Tuple[ResultType, "np.dtype", Tuple[int, ...], str, str, str]
+
+
 class YieldEvaluator:
     """
     This class implements method `enumerate_results` which iterates on
     intermediates results. By default, it uses
-    :class:`onnx_array_api.evaluator.ExtendedReferenceEvaluator`.
+    :class:`onnx_array_api.reference.ExtendedReferenceEvaluator`.
 
     :param onnx_model: model to run
     :param recursive: dig into subgraph and functions as well
@@ -102,7 +105,7 @@ def enumerate_summarized(
         self,
         output_names: Optional[List[str]] = None,
         feed_inputs: Optional[Dict[str, Any]] = None,
-    ) -> Iterator[Tuple[ResultType, str, Any]]:
+    ) -> Iterator[RESULT_TYPE]:
         """
         Executes the onnx model and enumerate intermediate results without their names.
 
@@ -111,13 +114,13 @@ def enumerate_summarized(
             feed_inputs: dictionary `{ input name: input value }`
 
         Returns:
-            iterator on tuple(result kind, node.type, dtype, shape, value)
+            iterator on tuple(result kind, node.type, dtype, shape, value, result name)
         """
         for kind, name, value, op_type in self.enumerate_results(
             output_names, feed_inputs
         ):
             summary = self.make_summary(value)
-            yield kind, value.dtype, value.shape, summary, op_type
+            yield kind, value.dtype, value.shape, summary, op_type, name
 
     def make_summary(self, value: Any, length: int = 4, modulo: int = 26) -> str:
         """
@@ -143,3 +146,115 @@ def make_summary(self, value: Any, length: int = 4, modulo: int = 26) -> str:
         value4i = value4.astype(np.int64) % modulo
         s = "".join([chr(65 + i) for i in value4i])
         return s
+
+
+class DistanceExecution:
+    """
+    Computes a distance between two results.
+    """
+
+    float_types = {
+        np.float16,
+        np.float32,
+        np.float64,
+        np.dtype("float16"),
+        np.dtype("float32"),
+        np.dtype("float64"),
+    }
+
+    def __init__(self, max_lag: int = 50):
+        self.kind_cost = 1000
+        self.type_cost = 10
+        self.rank_cost = 100
+        self.op_type_cost = 10
+        self.max_lag = max_lag
+        self.insert_cost = 1000
+
+    def distance_pair(self, r1: RESULT_TYPE, r2: RESULT_TYPE) -> float:
+        """
+        (ResultType.RESULT, np.dtype("float32"), (2, 2), "CEIO", "Abs"),
+
+        :param r1: first result
+        :param r2: second result
+        :return: distance
+        """
+        d = 0
+        if r1[0] != r2[0]:
+            # difference type
+            d += self.kind_cost
+        if r1[1] != r2[1]:
+            d += self._cost_type(r1[1], r2[1]) * self.type_cost
+        if r1[2] != r2[2]:
+            d += self._cost_shape(r1[2], r2[2])
+        if r1[3] != r2[3]:
+            d += self._cost_summary(r1[3], r2[3])
+        if r1[4] != r2[4]:
+            d += self.op_type_cost
+        return d
+
+    def _cost_type(self, t1: "np.dtype", t2: "np.dtype") -> float:
+        if t1 in self.float_types and t2 in self.float_types:
+            return 0.2
+        return 1
+
+    def _cost_shape(self, s1: Tuple[int, ...], s2: Tuple[int, ...]) -> float:
+        d = abs(np.prod(s1) - np.prod(s2))
+        if len(s1) != len(s2):
+            return self.rank_cost + d
+        for i, j in zip(s1, s2):
+            d += abs(i - j)
+        return d
+
+    def _cost_summary(self, s1: str, s2: str) -> float:
+        if len(s1) != len(s2):
+            return 1e6
+        d = 0
+        for a, b in zip(s1, s2):
+            d += abs(ord(a) - ord(b))
+        return d
+
+    def distance_sequence(
+        self, s1: List[RESULT_TYPE], s2: List[RESULT_TYPE]
+    ) -> Tuple[float, List[Tuple[int, int]]]:
+        """
+        Computes the distance between two sequences of results.
+
+        :param s1: first sequence
+        :param s2: second sequence
+        :return: distance and alignment
+        """
+        delay = self.max_lag
+        distance = {(-1, -1): 0}
+        predecessor = {(-1, -1): None}
+        for i in range(len(s1)):
+            for j in range(max(0, i - delay), min(len(s2), i + delay)):
+                best = 1e100
+                pred = None
+                ki, kj = i - 1, j - 1
+                if (ki, kj) in distance:
+                    d = distance[ki, kj] + self.distance_pair(s1[i], s2[j])
+                    if d < best:
+                        best = d
+                        pred = (ki, kj)
+                ki, kj = i - 1, j
+                if (ki, kj) in distance:
+                    d = distance[ki, kj] + self.insert_cost
+                    if d < best:
+                        best = d
+                        pred = (ki, kj)
+                ki, kj = i, j - 1
+                if (ki, kj) in distance:
+                    d = distance[ki, kj] + self.insert_cost
+                    if d < best:
+                        best = d
+                        pred = (ki, kj)
+                distance[i, j] = best
+                predecessor[i, j] = pred
+
+        # reverse
+        way = []
+        last = predecessor[len(s1) - 1, len(s2) - 1]
+        while last is not None:
+            way.append(last)
+            last = predecessor[last]
+        return distance[len(s1) - 1, len(s2) - 1], list(reversed(way))[1:]

From 09e79d2f6902705b921799d3f5461a5e24e40164 Mon Sep 17 00:00:00 2001
From: Xavier Dupre <xadupre@microsoft.com>
Date: Mon, 5 Feb 2024 15:35:29 +0100
Subject: [PATCH 06/13] text

---
 .../ut_reference/test_evaluator_yield.py      |  61 +++++++-
 onnx_array_api/reference/__init__.py          |   7 +-
 onnx_array_api/reference/evaluator_yield.py   | 142 +++++++++++++++---
 3 files changed, 184 insertions(+), 26 deletions(-)

diff --git a/_unittests/ut_reference/test_evaluator_yield.py b/_unittests/ut_reference/test_evaluator_yield.py
index 60da2ee..d4ef8a6 100644
--- a/_unittests/ut_reference/test_evaluator_yield.py
+++ b/_unittests/ut_reference/test_evaluator_yield.py
@@ -10,7 +10,12 @@
     make_tensor_value_info,
 )
 from onnx_array_api.ext_test_case import ExtTestCase
-from onnx_array_api.reference import YieldEvaluator, ResultType, DistanceExecution
+from onnx_array_api.reference import (
+    YieldEvaluator,
+    ResultType,
+    DistanceExecution,
+    ResultExecution,
+)
 
 
 class TestArrayTensor(ExtTestCase):
@@ -355,6 +360,60 @@ def test_distance_sequence_diff2(self):
         self.assertEqual(d, 5)
         self.assertEqual(align, [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)])
 
+    def test_distance_sequence_str(self):
+        s1 = [
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "A"),
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "B"),
+            (ResultType.INPUT, np.dtype("float32"), (2, 3), "ABCD", None, "X"),
+            (ResultType.RESULT, np.dtype("float32"), (2, 2), "CEIO", "Exp", "H"),
+            (
+                ResultType.RESULT,
+                np.dtype("float32"),
+                (2, 2),
+                "CEIO",
+                "LinearRegression",
+                "Y1",
+            ),
+            (ResultType.RESULT, np.dtype("float32"), (2, 2), "CEIO", "Abs", "Y"),
+            (ResultType.OUTPUT, np.dtype("float32"), (2, 2), "CEIO", None, "Y"),
+        ]
+        s2 = [
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "A"),
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "B"),
+            (ResultType.INPUT, np.dtype("float32"), (2, 2), "ABCD", None, "X"),
+            (
+                ResultType.RESULT,
+                np.dtype("float32"),
+                (2, 2),
+                "CEIO",
+                "LinearRegression",
+                "Y1",
+            ),
+            (ResultType.RESULT, np.dtype("float32"), (2, 3), "CEIP", "Abs", "Z"),
+            (ResultType.OUTPUT, np.dtype("float32"), (2, 2), "CEIP", None, "Y"),
+        ]
+        s1 = [ResultExecution(*s) for s in s1]
+        s2 = [ResultExecution(*s) for s in s2]
+
+        dc = DistanceExecution()
+        d, align = dc.distance_sequence(s1, s2)
+        self.assertEqual(d, 1008)
+        self.assertEqual(align, [(0, 0), (1, 1), (2, 2), (3, 2), (4, 3), (5, 4)])
+        text = dc.to_str(s1, s2, align)
+        expected = """
+            =|INPUTfloat322x2ABCDA|INPUTfloat322x2ABCDA
+            =|INPUTfloat322x2ABCDB|INPUTfloat322x2ABCDB
+            ~|INPUTfloat322x3ABCDX|INPUTfloat322x2ABCDX
+            -|RESULTfloat322x2CEIOExpH|
+            =|RESULTfloat322x2CEIOLinearReY1|RESULTfloat322x2CEIOLinearReY1
+            ~|RESULTfloat322x2CEIOAbsY|RESULTfloat322x3CEIPAbsZ
+        """.replace(
+            "            ", ""
+        ).strip(
+            "\n "
+        )
+        self.assertEqual(expected, text.replace(" ", "").strip("\n"))
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
diff --git a/onnx_array_api/reference/__init__.py b/onnx_array_api/reference/__init__.py
index 6f6fc58..6fa2a33 100644
--- a/onnx_array_api/reference/__init__.py
+++ b/onnx_array_api/reference/__init__.py
@@ -11,7 +11,12 @@
 )
 from onnx.reference.op_run import to_array_extended
 from .evaluator import ExtendedReferenceEvaluator
-from .evaluator_yield import DistanceExecution, ResultType, YieldEvaluator
+from .evaluator_yield import (
+    DistanceExecution,
+    ResultExecution,
+    ResultType,
+    YieldEvaluator,
+)
 
 
 def from_array_extended(tensor: np.array, name: Optional[str] = None) -> TensorProto:
diff --git a/onnx_array_api/reference/evaluator_yield.py b/onnx_array_api/reference/evaluator_yield.py
index d347f39..e63feaf 100644
--- a/onnx_array_api/reference/evaluator_yield.py
+++ b/onnx_array_api/reference/evaluator_yield.py
@@ -1,3 +1,4 @@
+from dataclasses import dataclass
 from typing import Any, Dict, List, Iterator, Optional, Tuple
 from enum import IntEnum
 import numpy as np
@@ -5,6 +6,14 @@
 from .evaluator import ExtendedReferenceEvaluator
 
 
+def _align(res: str, limit: int) -> str:
+    if len(res) == limit:
+        return res
+    if len(res) > limit:
+        return res[:limit]
+    return res + " " * (limit - len(res))
+
+
 class ResultType(IntEnum):
     RESULT = 1
     INITIALIZER = 2
@@ -16,7 +25,47 @@ def __repr__(self):
         return f"{self.__class__.__name__}.{self._name_}"
 
 
-RESULT_TYPE = Tuple[ResultType, "np.dtype", Tuple[int, ...], str, str, str]
+@dataclass
+class ResultExecution:
+    """
+    The description of a result.
+    """
+
+    kind: ResultType
+    dtype: object
+    shape: tuple
+    summary: str
+    op_type: str
+    name: str
+
+    def __len__(self) -> int:
+        return 6
+
+    def __getitem__(self, i: int) -> Any:
+        if i == 0:
+            return self.kind
+        if i == 1:
+            return self.dtype
+        if i == 2:
+            return self.shape
+        if i == 3:
+            return self.summary
+        if i == 4:
+            return self.op_type
+        if i == 5:
+            return self.name
+        raise IndexError(f"i={i} out of boundary")
+
+    def __str__(self):
+        els = [
+            _align(self.kind._name_, 6),
+            _align(str(self.dtype).replace("dtype(", "").replace(")", ""), 8),
+            _align("x".join(map(str, self.shape)), 15),
+            self.summary,
+            _align(self.op_type or "", 8),
+            self.name or "",
+        ]
+        return " ".join(els)
 
 
 class YieldEvaluator:
@@ -101,27 +150,6 @@ def enumerate_results(
                 )
             yield ResultType.OUTPUT, name, results[name], None
 
-    def enumerate_summarized(
-        self,
-        output_names: Optional[List[str]] = None,
-        feed_inputs: Optional[Dict[str, Any]] = None,
-    ) -> Iterator[RESULT_TYPE]:
-        """
-        Executes the onnx model and enumerate intermediate results without their names.
-
-        Args:
-            output_names: requested outputs by names, None for all
-            feed_inputs: dictionary `{ input name: input value }`
-
-        Returns:
-            iterator on tuple(result kind, node.type, dtype, shape, value, result name)
-        """
-        for kind, name, value, op_type in self.enumerate_results(
-            output_names, feed_inputs
-        ):
-            summary = self.make_summary(value)
-            yield kind, value.dtype, value.shape, summary, op_type, name
-
     def make_summary(self, value: Any, length: int = 4, modulo: int = 26) -> str:
         """
         Create a short string summarizing the value (discretization).
@@ -147,6 +175,29 @@ def make_summary(self, value: Any, length: int = 4, modulo: int = 26) -> str:
         s = "".join([chr(65 + i) for i in value4i])
         return s
 
+    def enumerate_summarized(
+        self,
+        output_names: Optional[List[str]] = None,
+        feed_inputs: Optional[Dict[str, Any]] = None,
+    ) -> Iterator[ResultExecution]:
+        """
+        Executes the onnx model and enumerate intermediate results without their names.
+
+        Args:
+            output_names: requested outputs by names, None for all
+            feed_inputs: dictionary `{ input name: input value }`
+
+        Returns:
+            iterator on tuple(result kind, node.type, dtype, shape, value, result name)
+        """
+        for kind, name, value, op_type in self.enumerate_results(
+            output_names, feed_inputs
+        ):
+            summary = self.make_summary(value)
+            yield ResultExecution(
+                kind, value.dtype, value.shape, summary, op_type, name
+            )
+
 
 class DistanceExecution:
     """
@@ -170,7 +221,7 @@ def __init__(self, max_lag: int = 50):
         self.max_lag = max_lag
         self.insert_cost = 1000
 
-    def distance_pair(self, r1: RESULT_TYPE, r2: RESULT_TYPE) -> float:
+    def distance_pair(self, r1: ResultExecution, r2: ResultExecution) -> float:
         """
         (ResultType.RESULT, np.dtype("float32"), (2, 2), "CEIO", "Abs"),
 
@@ -214,7 +265,7 @@ def _cost_summary(self, s1: str, s2: str) -> float:
         return d
 
     def distance_sequence(
-        self, s1: List[RESULT_TYPE], s2: List[RESULT_TYPE]
+        self, s1: List[ResultExecution], s2: List[ResultExecution]
     ) -> Tuple[float, List[Tuple[int, int]]]:
         """
         Computes the distance between two sequences of results.
@@ -258,3 +309,46 @@ def distance_sequence(
             way.append(last)
             last = predecessor[last]
         return distance[len(s1) - 1, len(s2) - 1], list(reversed(way))[1:]
+
+    def to_str(
+        self,
+        s1: List[ResultExecution],
+        s2: List[ResultExecution],
+        alignment: List[Tuple[int, int]],
+        column_size: int = 50,
+    ) -> str:
+        """
+        Prints out the alignment between two sequences into a string.
+        :param s1: first sequence
+        :param s2: second sequence
+        :param alignment: alignment
+        :param column_size: column size
+        :return: test
+        """
+        rows = []
+        last = -1, -1
+        for i, j in alignment:
+            assert i < len(s1), f"Unexpected value i={i} >= len(s1)={len(s1)}"
+            assert j < len(s2), f"Unexpected value i={j} >= len(s2)={len(s2)}"
+            expected = last[0] + 1, last[1] + 1
+
+            if expected == (i, j):
+                d1 = s1[i]
+                d2 = s2[j]
+                d = self.distance_pair(d1, d2)
+                symbol = "=" if d == 0 else "~"
+                rows.append(
+                    f"{symbol} | {_align(str(d1), column_size)} | {_align(str(d2), column_size)}"
+                )
+            elif i == last[0]:
+                d2 = s2[j]
+                rows.append(
+                    f"+ | {_align('', column_size)} | {_align(str(d2), column_size)} "
+                )
+            else:
+                d1 = s1[i]
+                rows.append(
+                    f"- | {_align(str(d1), column_size)} | {_align('', column_size)}"
+                )
+            last = i, j
+        return "\n".join(rows)

From e1e30dcdc8a422ffad459fbcd8e7a0fc2bc48046 Mon Sep 17 00:00:00 2001
From: Xavier Dupre <xadupre@microsoft.com>
Date: Mon, 5 Feb 2024 16:30:37 +0100
Subject: [PATCH 07/13] compare function

---
 _doc/api/reference.rst                        | 17 +++-
 .../ut_reference/test_evaluator_yield.py      | 25 ++++++
 onnx_array_api/reference/__init__.py          |  1 +
 onnx_array_api/reference/evaluator_yield.py   | 89 ++++++++++++++++++-
 4 files changed, 129 insertions(+), 3 deletions(-)

diff --git a/_doc/api/reference.rst b/_doc/api/reference.rst
index a1507c8..7752ce6 100644
--- a/_doc/api/reference.rst
+++ b/_doc/api/reference.rst
@@ -7,12 +7,21 @@ ExtendedReferenceEvaluator
 .. autoclass:: onnx_array_api.reference.ExtendedReferenceEvaluator
     :members:
 
-YieldEvaluator
-++++++++++++++
+ResultType
+++++++++++
 
 .. autoclass:: onnx_array_api.reference.ResultType
     :members:
 
+ResultExecution
++++++++++++++++
+
+.. autoclass:: onnx_array_api.reference.ResultExecution
+    :members:
+
+YieldEvaluator
+++++++++++++++
+
 .. autoclass:: onnx_array_api.reference.YieldEvaluator
     :members:
 
@@ -22,3 +31,7 @@ DistanceExecution
 .. autoclass:: onnx_array_api.reference.DistanceExecution
     :members:
 
+compare_execution
++++++++++++++++++
+
+.. autofunction:: onnx_array_api.reference.compare_execution
diff --git a/_unittests/ut_reference/test_evaluator_yield.py b/_unittests/ut_reference/test_evaluator_yield.py
index d4ef8a6..57d39a4 100644
--- a/_unittests/ut_reference/test_evaluator_yield.py
+++ b/_unittests/ut_reference/test_evaluator_yield.py
@@ -9,12 +9,14 @@
     make_opsetid,
     make_tensor_value_info,
 )
+from onnx.parser import parse_model
 from onnx_array_api.ext_test_case import ExtTestCase
 from onnx_array_api.reference import (
     YieldEvaluator,
     ResultType,
     DistanceExecution,
     ResultExecution,
+    compare_onnx_execution,
 )
 
 
@@ -414,6 +416,29 @@ def test_distance_sequence_str(self):
         )
         self.assertEqual(expected, text.replace(" ", "").strip("\n"))
 
+    def test_compare_execution(self):
+        m1 = parse_model(
+            """
+            <ir_version: 8, opset_import: [ "": 18]>
+            agraph (float[N] x) => (float[N] z) {
+                two = Constant <value_float=2.0> ()
+                four = Add(two, two)
+                z = Mul(x, x)
+            }"""
+        )
+        m2 = parse_model(
+            """
+            <ir_version: 8, opset_import: [ "": 18]>
+            agraph (float[N] x) => (float[N] z) {
+                two = Constant <value_float=2.0> ()
+                z = Mul(x, x)
+            }"""
+        )
+        res1, res2, align, dc = compare_onnx_execution(m1, m2)
+        text = dc.to_str(res1, res2, align)
+        self.assertIn("CAAA Constant", text)
+        self.assertEqual(len(align), 4)
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
diff --git a/onnx_array_api/reference/__init__.py b/onnx_array_api/reference/__init__.py
index 6fa2a33..fd1d27c 100644
--- a/onnx_array_api/reference/__init__.py
+++ b/onnx_array_api/reference/__init__.py
@@ -16,6 +16,7 @@
     ResultExecution,
     ResultType,
     YieldEvaluator,
+    compare_onnx_execution,
 )
 
 
diff --git a/onnx_array_api/reference/evaluator_yield.py b/onnx_array_api/reference/evaluator_yield.py
index e63feaf..cb9e5fb 100644
--- a/onnx_array_api/reference/evaluator_yield.py
+++ b/onnx_array_api/reference/evaluator_yield.py
@@ -2,7 +2,7 @@
 from typing import Any, Dict, List, Iterator, Optional, Tuple
 from enum import IntEnum
 import numpy as np
-from onnx import ModelProto
+from onnx import ModelProto, TensorProto, ValueInfoProto
 from .evaluator import ExtendedReferenceEvaluator
 
 
@@ -352,3 +352,90 @@ def to_str(
                 )
             last = i, j
         return "\n".join(rows)
+
+
+def generate_input(info: ValueInfoProto) -> np.ndarray:
+    """
+    Generates one input.
+    """
+    elem_type = info.type.tensor_type.elem_type
+    shape = [
+        (getattr(d, "dim_value", None) or getattr(d, "dim_param"))
+        for d in info.type.tensor_type.shape.dim
+    ]
+    new_shape = []
+    for sh in shape:
+        if isinstance(sh, str):
+            if len(new_shape) == 0:
+                new_shape.append(1)
+            else:
+                new_shape.append(16)
+        else:
+            new_shape.append(sh)
+    new_shape = tuple(new_shape)
+    p = np.prod(new_shape)
+    value = np.arange(p)
+    if elem_type == TensorProto.INT32:
+        return value.astype(np.int32).reshape(new_shape)
+    if elem_type == TensorProto.INT64:
+        return value.astype(np.int64).reshape(new_shape)
+    if elem_type == TensorProto.FLOAT:
+        return (value.astype(np.float32) / p).astype(np.float32).reshape(new_shape)
+    if elem_type == TensorProto.FLOAT16:
+        return (value.astype(np.float16) / p).astype(np.float16).reshape(new_shape)
+    if elem_type == TensorProto.FLOAT64:
+        return (value.astype(np.float64) / p).astype(np.float64).reshape(new_shape)
+    raise RuntimeError(f"Unexpected element_type {elem_type} for info={info}")
+
+
+def generate_inputs(model: ModelProto) -> List[np.ndarray]:
+    """
+    Generates inputs for a specific model.
+
+    :param model: ModelProto
+    :return: list of inputs
+    """
+    inputs = []
+    inits = set(i.name for i in model.graph.initializer)
+    for inp in model.graph.input:
+        if inp.name in inits:
+            break
+        inputs.append(generate_input(inp))
+    return inputs
+
+
+def compare_onnx_execution(
+    model1: ModelProto, model2: ModelProto, verbose: int = 0
+) -> Tuple[List[ResultExecution], List[ResultExecution], List[Tuple[int, int]]]:
+    """
+    Compares the execution of two onnx models.
+    The function assumes both models takes the same inputs.
+
+    :param model1: first model
+    :param model2: second model
+    :param verbose: verbosity
+    :return: four results, a sequence of results for the first model and the second model,
+        the alignment between the two, DistanceExecution
+    """
+    if verbose:
+        print("[compare_onnx_execution] generate inputs")
+    inputs = generate_inputs(model1)
+    feeds1 = {i.name: v for i, v in zip(model1.graph.input, inputs)}
+    feeds2 = {i.name: v for i, v in zip(model2.graph.input, inputs)}
+    if verbose:
+        print(f"[compare_onnx_execution] got {len(inputs)} inputs")
+        print("[compare_onnx_execution] execute first model")
+    res1 = list(YieldEvaluator(model1).enumerate_summarized(None, feeds1))
+    if verbose:
+        print(f"[compare_onnx_execution] got {len(res1)} results")
+        print("[compare_onnx_execution] execute second model")
+    res2 = list(YieldEvaluator(model2).enumerate_summarized(None, feeds2))
+    if verbose:
+        print(f"[compare_onnx_execution] got {len(res2)} results")
+        print("[compare_onnx_execution] compute edit distance")
+    dc = DistanceExecution()
+    _, align = dc.distance_sequence(res1, res2)
+    if verbose:
+        print(f"[compare_onnx_execution] got {len(align)} pairs")
+        print("[compare_onnx_execution] done")
+    return res1, res2, align, dc

From 12b103159e1ec382a4fcc4fed3483bdc90b14c5d Mon Sep 17 00:00:00 2001
From: Xavier Dupre <xadupre@microsoft.com>
Date: Mon, 5 Feb 2024 18:19:13 +0100
Subject: [PATCH 08/13] fix FusedMatMul

---
 _doc/command_lines.rst                        | 52 +++++++++++++++
 _doc/index.rst                                |  1 +
 _unittests/ut_reference/test_array_tensor.py  | 26 +++++++-
 .../ut_reference/test_evaluator_yield.py      | 36 ++++++++---
 _unittests/ut_xrun_doc/test_command_lines1.py | 37 +++++++++++
 onnx_array_api/_command_lines_parser.py       | 58 ++++++++++++++++-
 onnx_array_api/reference/evaluator.py         |  2 +
 onnx_array_api/reference/evaluator_yield.py   | 64 ++++++++++---------
 .../reference/ops/op_fused_matmul.py          | 29 +++++++++
 9 files changed, 264 insertions(+), 41 deletions(-)
 create mode 100644 _doc/command_lines.rst
 create mode 100644 onnx_array_api/reference/ops/op_fused_matmul.py

diff --git a/_doc/command_lines.rst b/_doc/command_lines.rst
new file mode 100644
index 0000000..b7ff104
--- /dev/null
+++ b/_doc/command_lines.rst
@@ -0,0 +1,52 @@
+=============
+command lines
+=============
+
+compare
+=======
+
+The function convers an onnx file into some code.
+
+::
+
+    python -m compare -m1 model1.onnx -m2 model2.onnx -v 1
+
+Output example::
+
+    [compare_onnx_execution] got 2 inputs
+    [compare_onnx_execution] execute first model
+    [compare_onnx_execution] got 5 results
+    [compare_onnx_execution] execute second model
+    [compare_onnx_execution] got 5 results
+    [compare_onnx_execution] compute edit distance
+    [compare_onnx_execution] got 4 pairs
+    [compare_onnx_execution] done
+    = | INPUT  float32  5x6             AAAA          X    | INPUT  float32  5x6             AAAA          X   
+    = | INPUT  float32  5x6             AAAA          Y    | INPUT  float32  5x6             AAAA          Y   
+    = | RESULT float32  5x6             AABB Add      res  | RESULT float32  5x6             AABB Add      res 
+    = | RESULT float32  5x6             AAAA Cos      Z    | RESULT float32  5x6             AAAA Cos      Z 
+
+.. runpython::
+
+    from onnx_extended._command_lines_parser import get_parser_compare
+    get_parser_compare().print_help()
+
+See function :func:`onnx_array_api.reference.compare_onnx_execution`.
+
+translate
+=========
+
+The function convers an onnx file into some code.
+
+::
+
+    python -m translate ...
+
+Output example::
+
+    not yet ready  
+
+.. runpython::
+
+    from onnx_extended._command_lines_parser import get_parser_translate
+    get_parser_translate().print_help()
diff --git a/_doc/index.rst b/_doc/index.rst
index 02c4eed..b81be4f 100644
--- a/_doc/index.rst
+++ b/_doc/index.rst
@@ -36,6 +36,7 @@ The objective is to speed up the implementation of converter libraries.
     tutorial/index
     api/index
     tech/index
+    command_lines
     auto_examples/index
 
 .. toctree::
diff --git a/_unittests/ut_reference/test_array_tensor.py b/_unittests/ut_reference/test_array_tensor.py
index 59fe5f1..f13c3e5 100644
--- a/_unittests/ut_reference/test_array_tensor.py
+++ b/_unittests/ut_reference/test_array_tensor.py
@@ -1,7 +1,13 @@
 import unittest
 import numpy as np
 from onnx import TensorProto
-from onnx.helper import make_graph, make_model, make_node, make_tensor_value_info
+from onnx.helper import (
+    make_graph,
+    make_model,
+    make_node,
+    make_tensor_value_info,
+    make_opsetid,
+)
 from onnx_array_api.ext_test_case import ExtTestCase
 from onnx_array_api.reference import (
     to_array_extended,
@@ -51,6 +57,24 @@ def make_model_f8(fr, to):
                         back = from_array_extended(got, "a")
                         self.assertEqual(to, back.data_type)
 
+    def test_fused_matmul(self):
+        model = make_model(
+            make_graph(
+                [make_node("FusedMatMul", ["X", "Y"], ["Z"], domain="com.microsoft")],
+                "name",
+                [
+                    make_tensor_value_info("X", TensorProto.FLOAT, None),
+                    make_tensor_value_info("Y", TensorProto.FLOAT, None),
+                ],
+                [make_tensor_value_info("Z", TensorProto.FLOAT, None)],
+            ),
+            opset_imports=[make_opsetid("", 18), make_opsetid("com.microsoft", 1)],
+        )
+        ref = ExtendedReferenceEvaluator(model)
+        a = np.arange(4).reshape(-1, 2)
+        got = ref.run(None, {"X": a, "Y": a})
+        self.assertEqualArray(a @ a, got[0])
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
diff --git a/_unittests/ut_reference/test_evaluator_yield.py b/_unittests/ut_reference/test_evaluator_yield.py
index 57d39a4..8565628 100644
--- a/_unittests/ut_reference/test_evaluator_yield.py
+++ b/_unittests/ut_reference/test_evaluator_yield.py
@@ -18,9 +18,25 @@
     ResultExecution,
     compare_onnx_execution,
 )
+from onnx_array_api.reference.evaluator_yield import make_summary
 
 
 class TestArrayTensor(ExtTestCase):
+    def test_make_summary(self):
+        a = np.arange(12).reshape(3, 4)
+        v = make_summary(a)
+        self.assertEqual(v, "DMVE")
+        a = np.arange(12)
+        v = make_summary(a)
+        self.assertEqual(v, "DMVE")
+        a = np.arange(12).astype(np.float32)
+        v = make_summary(a)
+        self.assertEqual(v, "DMVE")
+        a = np.arange(13)
+        a[-1] = 0
+        v = make_summary(a)
+        self.assertEqual(v, "GWMA")
+
     def test_evaluator_yield(self):
         new_domain = "custom_domain"
         opset_imports = [make_opsetid("", 14), make_opsetid(new_domain, 1)]
@@ -210,7 +226,7 @@ def test_distance_sequence_0(self):
         dc = DistanceExecution()
         d, align = dc.distance_sequence(expected, expected)
         self.assertEqual(d, 0)
-        self.assertEqual(align, [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)])
+        self.assertEqual(align, [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)])
 
     def test_distance_sequence_ins(self):
         s1 = [
@@ -246,10 +262,10 @@ def test_distance_sequence_ins(self):
         dc = DistanceExecution()
         d, align = dc.distance_sequence(s1, s2)
         self.assertEqual(d, dc.insert_cost)
-        self.assertEqual(align, [(0, 0), (1, 1), (2, 2), (3, 3), (4, 3)])
+        self.assertEqual(align, [(0, 0), (1, 1), (2, 2), (3, 3), (4, 3), (5, 4)])
         d, align = dc.distance_sequence(s2, s1)
         self.assertEqual(d, dc.insert_cost)
-        self.assertEqual(align, [(0, 0), (1, 1), (2, 2), (3, 3), (3, 4)])
+        self.assertEqual(align, [(0, 0), (1, 1), (2, 2), (3, 3), (3, 4), (4, 5)])
 
     def test_distance_sequence_equal(self):
         s1 = [
@@ -286,7 +302,7 @@ def test_distance_sequence_equal(self):
         dc = DistanceExecution()
         d, align = dc.distance_sequence(s1, s2)
         self.assertEqual(d, 0)
-        self.assertEqual(align, [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)])
+        self.assertEqual(align, [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)])
 
     def test_distance_sequence_diff(self):
         s1 = [
@@ -323,7 +339,7 @@ def test_distance_sequence_diff(self):
         dc = DistanceExecution()
         d, align = dc.distance_sequence(s1, s2)
         self.assertEqual(d, 1)
-        self.assertEqual(align, [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)])
+        self.assertEqual(align, [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)])
 
     def test_distance_sequence_diff2(self):
         s1 = [
@@ -360,7 +376,7 @@ def test_distance_sequence_diff2(self):
         dc = DistanceExecution()
         d, align = dc.distance_sequence(s1, s2)
         self.assertEqual(d, 5)
-        self.assertEqual(align, [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)])
+        self.assertEqual(align, [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)])
 
     def test_distance_sequence_str(self):
         s1 = [
@@ -400,8 +416,11 @@ def test_distance_sequence_str(self):
         dc = DistanceExecution()
         d, align = dc.distance_sequence(s1, s2)
         self.assertEqual(d, 1008)
-        self.assertEqual(align, [(0, 0), (1, 1), (2, 2), (3, 2), (4, 3), (5, 4)])
+        self.assertEqual(
+            align, [(0, 0), (1, 1), (2, 2), (3, 2), (4, 3), (5, 4), (6, 5)]
+        )
         text = dc.to_str(s1, s2, align)
+        self.assertIn("OUTPUT", text)
         expected = """
             =|INPUTfloat322x2ABCDA|INPUTfloat322x2ABCDA
             =|INPUTfloat322x2ABCDB|INPUTfloat322x2ABCDB
@@ -409,6 +428,7 @@ def test_distance_sequence_str(self):
             -|RESULTfloat322x2CEIOExpH|
             =|RESULTfloat322x2CEIOLinearReY1|RESULTfloat322x2CEIOLinearReY1
             ~|RESULTfloat322x2CEIOAbsY|RESULTfloat322x3CEIPAbsZ
+            ~|OUTPUTfloat322x2CEIOY|OUTPUTfloat322x2CEIPY
         """.replace(
             "            ", ""
         ).strip(
@@ -437,7 +457,7 @@ def test_compare_execution(self):
         res1, res2, align, dc = compare_onnx_execution(m1, m2)
         text = dc.to_str(res1, res2, align)
         self.assertIn("CAAA Constant", text)
-        self.assertEqual(len(align), 4)
+        self.assertEqual(len(align), 5)
 
 
 if __name__ == "__main__":
diff --git a/_unittests/ut_xrun_doc/test_command_lines1.py b/_unittests/ut_xrun_doc/test_command_lines1.py
index 8aa17ee..1cd16bb 100644
--- a/_unittests/ut_xrun_doc/test_command_lines1.py
+++ b/_unittests/ut_xrun_doc/test_command_lines1.py
@@ -14,6 +14,7 @@
 from onnx_array_api.ext_test_case import ExtTestCase
 from onnx_array_api._command_lines_parser import (
     get_main_parser,
+    get_parser_compare,
     get_parser_translate,
     main,
 )
@@ -70,6 +71,42 @@ def test_command_translate(self):
             code = st.getvalue()
             self.assertIn("start(opset=", code)
 
+    def test_parser_compare(self):
+        st = StringIO()
+        with redirect_stdout(st):
+            get_parser_compare().print_help()
+        text = st.getvalue()
+        self.assertIn("model1", text)
+
+    def test_command_compare(self):
+        X = make_tensor_value_info("X", TensorProto.FLOAT, [5, 6])
+        Y = make_tensor_value_info("Y", TensorProto.FLOAT, [5, 6])
+        Z = make_tensor_value_info("Z", TensorProto.FLOAT, [5, 6])
+        graph = make_graph(
+            [
+                make_node("Add", ["X", "Y"], ["res"]),
+                make_node("Cos", ["res"], ["Z"]),
+            ],
+            "g",
+            [X, Y],
+            [Z],
+        )
+        onnx_model = make_model(graph, opset_imports=[make_opsetid("", 18)])
+
+        with tempfile.TemporaryDirectory() as root:
+            model_file = os.path.join(root, "model.onnx")
+            with open(model_file, "wb") as f:
+                f.write(onnx_model.SerializeToString())
+
+            args = ["compare", "-m1", model_file, "-m2", model_file, "-v", "1"]
+            st = StringIO()
+            with redirect_stdout(st):
+                main(args)
+
+            code = st.getvalue()
+            self.assertIn("[compare_onnx_execution]", code)
+            self.assertIn("AAAA", code)
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
diff --git a/onnx_array_api/_command_lines_parser.py b/onnx_array_api/_command_lines_parser.py
index 71f5a35..a180deb 100644
--- a/onnx_array_api/_command_lines_parser.py
+++ b/onnx_array_api/_command_lines_parser.py
@@ -14,12 +14,13 @@ def get_main_parser() -> ArgumentParser:
     )
     parser.add_argument(
         "cmd",
-        choices=["translate"],
+        choices=["translate", "compare"],
         help=dedent(
             """
         Selects a command.
         
-        'translate' exports an onnx graph into a piece of code replicating it.
+        'translate' exports an onnx graph into a piece of code replicating it,
+        'compares' compares the execution of two onnx models
         """
         ),
     )
@@ -65,8 +66,59 @@ def _cmd_translate(argv: List[Any]):
     print(code)
 
 
+def get_parser_compare() -> ArgumentParser:
+    parser = ArgumentParser(
+        prog="compare",
+        description=dedent(
+            """
+        Compares the execution of two onnx models.
+        """
+        ),
+        epilog="This is used when two models are different but should produce the same results.",
+    )
+    parser.add_argument(
+        "-m1",
+        "--model1",
+        type=str,
+        required=True,
+        help="first onnx model",
+    )
+    parser.add_argument(
+        "-m2",
+        "--model2",
+        type=str,
+        required=True,
+        help="second onnx model",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        default=0,
+        help="verbosity",
+    )
+    parser.add_argument(
+        "-c",
+        "--column-size",
+        default=50,
+        help="column size when displaying the results",
+    )
+    return parser
+
+
+def _cmd_compare(argv: List[Any]):
+    from .reference import compare_onnx_execution
+
+    parser = get_parser_compare()
+    args = parser.parse_args(argv[1:])
+    onx1 = onnx.load(args.model1)
+    onx2 = onnx.load(args.model2)
+    res1, res2, align, dc = compare_onnx_execution(onx1, onx2, verbose=args.verbose)
+    text = dc.to_str(res1, res2, align, column_size=args.column_size)
+    print(text)
+
+
 def main(argv: Optional[List[Any]] = None):
-    fcts = dict(translate=_cmd_translate)
+    fcts = dict(translate=_cmd_translate, compare=_cmd_compare)
 
     if argv is None:
         argv = sys.argv[1:]
diff --git a/onnx_array_api/reference/evaluator.py b/onnx_array_api/reference/evaluator.py
index e20be76..54f0c26 100644
--- a/onnx_array_api/reference/evaluator.py
+++ b/onnx_array_api/reference/evaluator.py
@@ -7,6 +7,7 @@
 from .ops.op_cast_like import CastLike_15, CastLike_19
 from .ops.op_concat import Concat
 from .ops.op_constant_of_shape import ConstantOfShape
+from .ops.op_fused_matmul import FusedMatMul
 
 
 logger = getLogger("onnx-array-api-eval")
@@ -32,6 +33,7 @@ class ExtendedReferenceEvaluator(ReferenceEvaluator):
         CastLike_15,
         CastLike_19,
         ConstantOfShape,
+        FusedMatMul,
     ]
 
     @staticmethod
diff --git a/onnx_array_api/reference/evaluator_yield.py b/onnx_array_api/reference/evaluator_yield.py
index cb9e5fb..9c77deb 100644
--- a/onnx_array_api/reference/evaluator_yield.py
+++ b/onnx_array_api/reference/evaluator_yield.py
@@ -68,6 +68,32 @@ def __str__(self):
         return " ".join(els)
 
 
+def make_summary(value: Any, length: int = 4, modulo: int = 26) -> str:
+    """
+    Create a short string summarizing the value (discretization).
+
+    :param value: array
+    :param length: number of value to produce
+    :param module: discretization parameter
+    :return: short string
+    """
+    value4 = np.zeros(length, dtype=np.float64)
+    if value.size <= length:
+        value4[: value.size] = value.flatten().astype(np.float64)
+    else:
+        if value.size % length != 0:
+            value2 = np.zeros(
+                value.size + length - value.size % length, dtype=np.float64
+            )
+            value2[: value.size] = value.flatten().astype(np.float64)
+        else:
+            value2 = value.flatten().astype(np.float64)
+        value4 = value2.reshape((4, -1)).sum(axis=1)
+    value4i = value4.astype(np.int64) % modulo
+    s = "".join([chr(65 + i) for i in value4i])
+    return s
+
+
 class YieldEvaluator:
     """
     This class implements method `enumerate_results` which iterates on
@@ -150,31 +176,6 @@ def enumerate_results(
                 )
             yield ResultType.OUTPUT, name, results[name], None
 
-    def make_summary(self, value: Any, length: int = 4, modulo: int = 26) -> str:
-        """
-        Create a short string summarizing the value (discretization).
-
-        :param value: array
-        :param length: number of value to produce
-        :param module: discretization parameter
-        :return: short string
-        """
-        value4 = np.zeros(4, dtype=np.float64)
-        if value.size <= length:
-            value4[: value.size] = value.flatten().astype(np.float64)
-        else:
-            if value.size % length != 2:
-                value2 = np.zeros(
-                    value.size + length - value.size % length, dtype=np.float64
-                )
-                value2[: value.size] = value.flatten().astype(np.float64)
-            else:
-                value2 = value.flatten().astype(np.float64)
-            value4 = value2.reshape((4, -1)).mean(axis=1)
-        value4i = value4.astype(np.int64) % modulo
-        s = "".join([chr(65 + i) for i in value4i])
-        return s
-
     def enumerate_summarized(
         self,
         output_names: Optional[List[str]] = None,
@@ -193,7 +194,7 @@ def enumerate_summarized(
         for kind, name, value, op_type in self.enumerate_results(
             output_names, feed_inputs
         ):
-            summary = self.make_summary(value)
+            summary = make_summary(value)
             yield ResultExecution(
                 kind, value.dtype, value.shape, summary, op_type, name
             )
@@ -304,7 +305,7 @@ def distance_sequence(
 
         # reverse
         way = []
-        last = predecessor[len(s1) - 1, len(s2) - 1]
+        last = len(s1) - 1, len(s2) - 1
         while last is not None:
             way.append(last)
             last = predecessor[last]
@@ -405,7 +406,10 @@ def generate_inputs(model: ModelProto) -> List[np.ndarray]:
 
 
 def compare_onnx_execution(
-    model1: ModelProto, model2: ModelProto, verbose: int = 0
+    model1: ModelProto,
+    model2: ModelProto,
+    inputs: Optional[List[Any]] = None,
+    verbose: int = 0,
 ) -> Tuple[List[ResultExecution], List[ResultExecution], List[Tuple[int, int]]]:
     """
     Compares the execution of two onnx models.
@@ -413,13 +417,15 @@ def compare_onnx_execution(
 
     :param model1: first model
     :param model2: second model
+    :param inputs: inputs to use
     :param verbose: verbosity
     :return: four results, a sequence of results for the first model and the second model,
         the alignment between the two, DistanceExecution
     """
     if verbose:
         print("[compare_onnx_execution] generate inputs")
-    inputs = generate_inputs(model1)
+    if inputs is None:
+        inputs = generate_inputs(model1)
     feeds1 = {i.name: v for i, v in zip(model1.graph.input, inputs)}
     feeds2 = {i.name: v for i, v in zip(model2.graph.input, inputs)}
     if verbose:
diff --git a/onnx_array_api/reference/ops/op_fused_matmul.py b/onnx_array_api/reference/ops/op_fused_matmul.py
new file mode 100644
index 0000000..0f5b495
--- /dev/null
+++ b/onnx_array_api/reference/ops/op_fused_matmul.py
@@ -0,0 +1,29 @@
+from onnx.reference.op_run import OpRun
+
+
+class FusedMatMul(OpRun):
+    op_domain = "com.microsoft"
+
+    def _run(
+        self,
+        A,
+        B,
+        alpha: float = 0,
+        transA: int = 0,
+        transB: int = 0,
+        transBatchA: int = 0,
+        transBatchB: int = 0,
+    ):
+        assert (
+            transBatchA == 0
+        ), f"Not implemented for transBatchA==1 and {A.shape}x{B.shape}"
+        assert (
+            transBatchB == 0
+        ), f"Not implemented for transBatchB==1 and {A.shape}x{B.shape}"
+        if transA:
+            dim = len(A.shape)
+            A = A.transpose(axes=(dim - 2, dim - 1))
+        if transB:
+            dim = len(B.shape)
+            B = B.transpose(axes=(dim - 2, dim - 1))
+        return (A @ B,)

From 09863991172c330ff41c95d790161c4bac0d2c63 Mon Sep 17 00:00:00 2001
From: Xavier Dupre <xadupre@microsoft.com>
Date: Mon, 5 Feb 2024 23:43:58 +0100
Subject: [PATCH 09/13] fix alpha

---
 onnx_array_api/reference/ops/op_fused_matmul.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/onnx_array_api/reference/ops/op_fused_matmul.py b/onnx_array_api/reference/ops/op_fused_matmul.py
index 0f5b495..9bafd96 100644
--- a/onnx_array_api/reference/ops/op_fused_matmul.py
+++ b/onnx_array_api/reference/ops/op_fused_matmul.py
@@ -1,3 +1,4 @@
+import numpy as np
 from onnx.reference.op_run import OpRun
 
 
@@ -26,4 +27,5 @@ def _run(
         if transB:
             dim = len(B.shape)
             B = B.transpose(axes=(dim - 2, dim - 1))
-        return (A @ B,)
+        a = np.array(alpha, dtype=A.dtype)
+        return (A @ B * a,)

From d1f276b67608e7d9ec413c01f8f0dcf621c37e65 Mon Sep 17 00:00:00 2001
From: Xavier Dupre <xadupre@microsoft.com>
Date: Tue, 6 Feb 2024 00:46:01 +0100
Subject: [PATCH 10/13] example

---
 _doc/examples/plot_onnx_diff.py               | 68 +++++++++++++++++++
 _unittests/ut_xrun_doc/test_command_lines1.py |  2 +-
 onnx_array_api/reference/evaluator_yield.py   |  8 ++-
 .../reference/ops/op_fused_matmul.py          |  2 +-
 4 files changed, 75 insertions(+), 5 deletions(-)
 create mode 100644 _doc/examples/plot_onnx_diff.py

diff --git a/_doc/examples/plot_onnx_diff.py b/_doc/examples/plot_onnx_diff.py
new file mode 100644
index 0000000..7a5f1d3
--- /dev/null
+++ b/_doc/examples/plot_onnx_diff.py
@@ -0,0 +1,68 @@
+"""
+
+.. _l-onnx-diff-example:
+
+Compares the conversions of the same model with different options
+=================================================================
+
+The script compares two onnx models obtained with the same trained
+scikit-learn models but converted with different options.
+
+A model
++++++++
+"""
+
+from sklearn.mixture import GaussianMixture
+from sklearn.datasets import load_iris
+from sklearn.model_selection import train_test_split
+from skl2onnx import to_onnx
+from onnx_array_api.reference import compare_onnx_execution
+from onnx_array_api.plotting.text_plot import onnx_simple_text_plot
+
+
+data = load_iris()
+X_train, X_test = train_test_split(data.data)
+model = GaussianMixture()
+model.fit(X_train)
+
+#################################
+# Conversion to onnx
+# ++++++++++++++++++
+
+onx = to_onnx(
+    model, X_train[:1], options={id(model): {"score_samples": True}}, target_opset=12
+)
+
+print(onnx_simple_text_plot(onx))
+
+##################################
+# Conversion to onnx without ReduceLogSumExp
+# ++++++++++++++++++++++++++++++++++++++++++
+
+onx2 = to_onnx(
+    model,
+    X_train[:1],
+    options={id(model): {"score_samples": True}},
+    black_op={"ReduceLogSumExp"},
+    target_opset=12,
+)
+
+print(onnx_simple_text_plot(onx2))
+
+
+#############################################
+# Differences
+# +++++++++++
+#
+# Function :func:`onnx_array_api.reference.compare_onnx_execution`
+# compares the intermediate results of two onnx models. Then it finds
+# the best alignmet between the two models using an edit distance.
+
+res1, res2, align, dc = compare_onnx_execution(onx, onx2, verbose=1)
+print("------------")
+text = dc.to_str(res1, res2, align)
+print(text)
+
+###############################
+# The display shows that ReduceSumSquare was replaced by Mul + ReduceSum,
+# and ReduceLogSumExp by ReduceMax + Sub + Exp + Log + Add.
diff --git a/_unittests/ut_xrun_doc/test_command_lines1.py b/_unittests/ut_xrun_doc/test_command_lines1.py
index 1cd16bb..02f84bd 100644
--- a/_unittests/ut_xrun_doc/test_command_lines1.py
+++ b/_unittests/ut_xrun_doc/test_command_lines1.py
@@ -105,7 +105,7 @@ def test_command_compare(self):
 
             code = st.getvalue()
             self.assertIn("[compare_onnx_execution]", code)
-            self.assertIn("AAAA", code)
+            self.assertIn("ADFF", code)
 
 
 if __name__ == "__main__":
diff --git a/onnx_array_api/reference/evaluator_yield.py b/onnx_array_api/reference/evaluator_yield.py
index 9c77deb..3935913 100644
--- a/onnx_array_api/reference/evaluator_yield.py
+++ b/onnx_array_api/reference/evaluator_yield.py
@@ -62,7 +62,7 @@ def __str__(self):
             _align(str(self.dtype).replace("dtype(", "").replace(")", ""), 8),
             _align("x".join(map(str, self.shape)), 15),
             self.summary,
-            _align(self.op_type or "", 8),
+            _align(self.op_type or "", 10),
             self.name or "",
         ]
         return " ".join(els)
@@ -316,7 +316,7 @@ def to_str(
         s1: List[ResultExecution],
         s2: List[ResultExecution],
         alignment: List[Tuple[int, int]],
-        column_size: int = 50,
+        column_size: int = 60,
     ) -> str:
         """
         Prints out the alignment between two sequences into a string.
@@ -384,7 +384,7 @@ def generate_input(info: ValueInfoProto) -> np.ndarray:
         return (value.astype(np.float32) / p).astype(np.float32).reshape(new_shape)
     if elem_type == TensorProto.FLOAT16:
         return (value.astype(np.float16) / p).astype(np.float16).reshape(new_shape)
-    if elem_type == TensorProto.FLOAT64:
+    if elem_type == TensorProto.DOUBLE:
         return (value.astype(np.float64) / p).astype(np.float64).reshape(new_shape)
     raise RuntimeError(f"Unexpected element_type {elem_type} for info={info}")
 
@@ -414,6 +414,8 @@ def compare_onnx_execution(
     """
     Compares the execution of two onnx models.
     The function assumes both models takes the same inputs.
+    See :ref:`l-onnx-diff-example` to see a full example using
+    this function.
 
     :param model1: first model
     :param model2: second model
diff --git a/onnx_array_api/reference/ops/op_fused_matmul.py b/onnx_array_api/reference/ops/op_fused_matmul.py
index 9bafd96..0f738c7 100644
--- a/onnx_array_api/reference/ops/op_fused_matmul.py
+++ b/onnx_array_api/reference/ops/op_fused_matmul.py
@@ -9,7 +9,7 @@ def _run(
         self,
         A,
         B,
-        alpha: float = 0,
+        alpha: float = 1,
         transA: int = 0,
         transB: int = 0,
         transBatchA: int = 0,

From 42c5e695a9f992af53154caaf192b0f3c10c4442 Mon Sep 17 00:00:00 2001
From: Xavier Dupre <xadupre@microsoft.com>
Date: Tue, 6 Feb 2024 00:51:08 +0100
Subject: [PATCH 11/13] documentation

---
 _doc/tutorial/index.rst |  1 +
 _doc/tutorial/tools.rst | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+)
 create mode 100644 _doc/tutorial/tools.rst

diff --git a/_doc/tutorial/index.rst b/_doc/tutorial/index.rst
index f4cce00..9fcc557 100644
--- a/_doc/tutorial/index.rst
+++ b/_doc/tutorial/index.rst
@@ -10,4 +10,5 @@ Tutorial
     graph_api
     light_api
     numpy_api
+    tools
     benchmarks
diff --git a/_doc/tutorial/tools.rst b/_doc/tutorial/tools.rst
new file mode 100644
index 0000000..fe673f7
--- /dev/null
+++ b/_doc/tutorial/tools.rst
@@ -0,0 +1,20 @@
+=====
+Tools
+=====
+
+Some of useful tools.
+
+Text representation
+===================
+
+Plotting a graph is great but difficult to read when
+the graph is big and it is slow.
+:func:`onnx_array_api.plotting.text_plot.onnx_simple_text_plot`
+prints out a text representation.
+
+Differences between two models
+==============================
+
+How to understand the differences between two models
+assuming they are producing the same outputs?
+Example :ref:`l-onnx-diff-example` shows how to do it.

From e2224414336de7b86c2ec517c773127378717ea0 Mon Sep 17 00:00:00 2001
From: Xavier Dupre <xadupre@microsoft.com>
Date: Tue, 6 Feb 2024 00:52:11 +0100
Subject: [PATCH 12/13] fix length

---
 _unittests/ut_reference/test_evaluator_yield.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_unittests/ut_reference/test_evaluator_yield.py b/_unittests/ut_reference/test_evaluator_yield.py
index 8565628..7181456 100644
--- a/_unittests/ut_reference/test_evaluator_yield.py
+++ b/_unittests/ut_reference/test_evaluator_yield.py
@@ -426,7 +426,7 @@ def test_distance_sequence_str(self):
             =|INPUTfloat322x2ABCDB|INPUTfloat322x2ABCDB
             ~|INPUTfloat322x3ABCDX|INPUTfloat322x2ABCDX
             -|RESULTfloat322x2CEIOExpH|
-            =|RESULTfloat322x2CEIOLinearReY1|RESULTfloat322x2CEIOLinearReY1
+            =|RESULTfloat322x2CEIOLinearRegrY1|RESULTfloat322x2CEIOLinearRegrY1
             ~|RESULTfloat322x2CEIOAbsY|RESULTfloat322x3CEIPAbsZ
             ~|OUTPUTfloat322x2CEIOY|OUTPUTfloat322x2CEIPY
         """.replace(

From 04418d4b584139c08788735b0da4ded4abd24c49 Mon Sep 17 00:00:00 2001
From: Xavier Dupre <xadupre@microsoft.com>
Date: Tue, 6 Feb 2024 01:25:44 +0100
Subject: [PATCH 13/13] doc

---
 _doc/api/reference.rst | 6 +++---
 _doc/command_lines.rst | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/_doc/api/reference.rst b/_doc/api/reference.rst
index 7752ce6..3b4ae7d 100644
--- a/_doc/api/reference.rst
+++ b/_doc/api/reference.rst
@@ -31,7 +31,7 @@ DistanceExecution
 .. autoclass:: onnx_array_api.reference.DistanceExecution
     :members:
 
-compare_execution
-+++++++++++++++++
+compare_onnx_execution
+++++++++++++++++++++++
 
-.. autofunction:: onnx_array_api.reference.compare_execution
+.. autofunction:: onnx_array_api.reference.compare_onnx_execution
diff --git a/_doc/command_lines.rst b/_doc/command_lines.rst
index b7ff104..38ca5f2 100644
--- a/_doc/command_lines.rst
+++ b/_doc/command_lines.rst
@@ -28,7 +28,7 @@ Output example::
 
 .. runpython::
 
-    from onnx_extended._command_lines_parser import get_parser_compare
+    from onnx_array_api._command_lines_parser import get_parser_compare
     get_parser_compare().print_help()
 
 See function :func:`onnx_array_api.reference.compare_onnx_execution`.
@@ -48,5 +48,5 @@ Output example::
 
 .. runpython::
 
-    from onnx_extended._command_lines_parser import get_parser_translate
+    from onnx_array_api._command_lines_parser import get_parser_translate
     get_parser_translate().print_help()