diff --git a/pygsti/baseobjs/label.py b/pygsti/baseobjs/label.py
index db1d92745..254b9b03c 100644
--- a/pygsti/baseobjs/label.py
+++ b/pygsti/baseobjs/label.py
@@ -13,6 +13,7 @@
 import itertools as _itertools
 import numbers as _numbers
 import sys as _sys
+import copy as _copy
 import numpy as _np
 
 
@@ -138,7 +139,8 @@ def __new__(cls, name, state_space_labels=None, time=None, args=None):
                 return LabelStr.init(name, time)
 
         else:
-            if args is not None: return LabelTupWithArgs.init(name, state_space_labels, time, args)
+            if args is not None:
+                return LabelTupWithArgs.init(name, state_space_labels, time, args)
             else:
                 if time == 0.0:
                     return LabelTup.init(name, state_space_labels)
@@ -200,6 +202,8 @@ def is_simple(self):
 
         return self.IS_SIMPLE
 
+    def copy(self):
+        return _copy.deepcopy(self)
 
 
 class LabelTup(Label, tuple):
diff --git a/pygsti/baseobjs/statespace.py b/pygsti/baseobjs/statespace.py
index 277edb0f4..d0b13793b 100644
--- a/pygsti/baseobjs/statespace.py
+++ b/pygsti/baseobjs/statespace.py
@@ -683,6 +683,12 @@ def __getstate__(self):
     def __setstate__(self, state_dict):
         for k, v in state_dict.items():
             self.__dict__[k] = v
+            try:
+                _ = self.__getattribute__(k)
+            except AttributeError:
+                _ = self.__dict__.pop(k)
+                self.__dict__['_' + k] = v
+                _ = self.__getattribute__(k)
         #reinitialize the hash
         self._hash = hash((self.tensor_product_blocks_labels,
                            self.tensor_product_blocks_dimensions,
diff --git a/pygsti/circuits/circuit.py b/pygsti/circuits/circuit.py
index 8cbeef237..45bb87cd9 100644
--- a/pygsti/circuits/circuit.py
+++ b/pygsti/circuits/circuit.py
@@ -9,13 +9,14 @@
 # in compliance with the License.  You may obtain a copy of the License at
 # http://www.apache.org/licenses/LICENSE-2.0 or in the LICENSE file in the root pyGSTi directory.
 #***************************************************************************************************
+
 from __future__ import annotations
-import collections as _collections
 import itertools as _itertools
 import warnings as _warnings
+from typing import List, Sequence, Literal, Tuple, Any, Hashable, Optional, TypeAlias
 
 import numpy as _np
-from pygsti.baseobjs.label import Label as _Label, CircuitLabel as _CircuitLabel
+from pygsti.baseobjs.label import Label as _Label, CircuitLabel as _CircuitLabel, LabelTupTup as _LabelTupTup
 
 from pygsti.baseobjs import outcomelabeldict as _ld, _compatibility as _compat
 from pygsti.tools import internalgates as _itgs
@@ -47,6 +48,19 @@
      +' unitary from standard_gatename_unitaries which matches up to a global phase.'
 _warnings.filterwarnings('module', message=msg, category=UserWarning)
 
+
+##############################################################################################
+# NOTE(Riley): these types are work-in-progress. They don't make a whole lot of sense to me
+# right now. It might be possible that they just _DONT_ make sense, and yet they're correct
+# in the context of the current implementation.
+_NestedLabelSeq = List[_Label | Sequence[_Label]]
+#   ^ An alias to make it easier to see how subsequent types relate.
+#     Don't use this in function signatures.
+LayerTupLike = Tuple[_LabelTupTup,    ...] | _NestedLabelSeq | Tuple[_Label, ...]
+LabelsLike   = Tuple[_NestedLabelSeq, ...] | _NestedLabelSeq
+##############################################################################################
+
+
 def _np_to_quil_def_str(name, input_array):
     """
     Write a DEFGATE block for RQC quil for an arbitrary one- or two-qubit unitary gate.
@@ -185,7 +199,7 @@ def to_label(x):
     """
     if isinstance(x, _Label): return x
     # # do this manually when desired, as it "boxes" a circuit being inserted
-    #elif isinstance(x,Circuit): return x.to_circuit_label()
+    elif isinstance(x,Circuit): return x.to_label()
     else: return _Label(x)
 
 
@@ -271,9 +285,11 @@ def from_tuple(cls, tup):
         else:
             return cls(tup)
 
-    def __init__(self, layer_labels=(), line_labels='auto', num_lines=None, editable=False,
-                 stringrep=None, name='', check=True, expand_subcircuits="default",
-                 occurrence=None, compilable_layer_indices=None):
+    def __init__(self,
+            layer_labels=(), line_labels: str|Tuple[Any,...] = 'auto', num_lines: Optional[int] = None,
+            editable=False, stringrep=None, name='', check=True, expand_subcircuits="default", occurrence=None,
+            compilable_layer_indices=None
+        ):
         """
         Creates a new Circuit object, encapsulating a quantum circuit.
 
@@ -517,7 +533,7 @@ def _fastinit(cls, labels, line_labels, editable, name='', stringrep=None, occur
     #Note: If editing _bare_init one should also check _copy_init in case changes must be propagated.
     def _bare_init(self, labels, line_labels, editable, name='', stringrep=None, occurrence=None,
                    compilable_layer_indices_tup=()):
-        self._labels = labels
+        self._labels : LabelsLike = labels
         self._line_labels = tuple(line_labels)
         self._occurrence_id = occurrence
         self._compilable_layer_indices_tup = compilable_layer_indices_tup # always a tuple, but can be empty.
@@ -531,11 +547,13 @@ def _bare_init(self, labels, line_labels, editable, name='', stringrep=None, occ
         self._name = name  # can be None
         #self._times = None  # for FUTURE expansion
         self.auxinfo = {}  # for FUTURE expansion / user metadata
+        self.saved_auxinfo = {}
+        self.saved_auxinfo["lanes"] = {tuple(line_labels): self._labels}
 
     #Note: If editing _copy_init one should also check _bare_init in case changes must be propagated.
     #specialized codepath for copying
     def _copy_init(self, labels, line_labels, editable, name='', stringrep=None, occurrence=None,
-                compilable_layer_indices_tup=(), hashable_tup=None, precomp_hash=None):
+                compilable_layer_indices_tup=(), hashable_tup=None, precomp_hash=None, saved_aux: dict[str, dict]={}):
         self._labels = labels
         self._line_labels = line_labels
         self._occurrence_id = occurrence
@@ -551,6 +569,9 @@ def _copy_init(self, labels, line_labels, editable, name='', stringrep=None, occ
         #self._times = None  # for FUTURE expansion
         self.auxinfo = {}  # for FUTURE expansion / user metadata
 
+
+        self.saved_auxinfo = saved_aux
+
         return self
     
     #pickle management functions
@@ -647,7 +668,7 @@ def occurrence(self, value):
         self._str = None  # regenerate string rep (it may have updated)
 
     @property
-    def layertup(self):
+    def layertup(self) -> LayerTupLike:
         """
         This Circuit's layers as a standard Python tuple of layer Labels.
 
@@ -1024,7 +1045,7 @@ def num_lines(self):
         """
         return len(self._line_labels)
     
-    def copy(self, editable='auto'):
+    def copy(self, editable: bool | Literal['auto'] = 'auto'):
         """
         Returns a copy of the circuit.
 
@@ -1051,13 +1072,13 @@ def copy(self, editable='auto'):
                 editable_labels =[[lbl] if lbl.IS_SIMPLE else list(lbl.components) for lbl in self._labels]
                 return ret._copy_init(editable_labels, self._line_labels, editable, 
                                       self._name, self._str, self._occurrence_id, 
-                                      self._compilable_layer_indices_tup)
+                                      self._compilable_layer_indices_tup, saved_aux=self.saved_auxinfo)
             else:
                 #copy the editable labels (avoiding shallow copy issues)
                 editable_labels = [sublist.copy() for sublist in self._labels]
                 return ret._copy_init(editable_labels, self._line_labels, editable, 
                                       self._name, self._str, self._occurrence_id, 
-                                      self._compilable_layer_indices_tup)
+                                      self._compilable_layer_indices_tup, saved_aux=self.saved_auxinfo)
         else: #create static copy
             if self._static:
                 #if presently static leverage precomputed hashable_tup and hash. 
@@ -1066,7 +1087,7 @@ def copy(self, editable='auto'):
                 return ret._copy_init(self._labels, self._line_labels, editable, 
                                       self._name, self._str, self._occurrence_id, 
                                       self._compilable_layer_indices_tup, 
-                                      self._hashable_tup, self._hash)
+                                      self._hashable_tup, self._hash, saved_aux=self.saved_auxinfo)
             else:
                 static_labels = tuple([layer_lbl if isinstance(layer_lbl, _Label) else _Label(layer_lbl) 
                                        for layer_lbl in self._labels])
@@ -1074,7 +1095,7 @@ def copy(self, editable='auto'):
                 return ret._copy_init(static_labels, self._line_labels, 
                                       editable, self._name, self._str, self._occurrence_id, 
                                       self._compilable_layer_indices_tup, 
-                                      hashable_tup, hash(hashable_tup))
+                                      hashable_tup, hash(hashable_tup), saved_aux=self.saved_auxinfo)
 
     def clear(self):
         """
@@ -1223,7 +1244,7 @@ def extract_labels(self, layers=None, lines=None, strict=True):
                     return self._labels[layers]
                 if isinstance(layers, slice) and strict is True:  # if strict=False, then need to recompute line labels
                     #can speed this up a measurably by manually computing the new hashable tuple value and hash
-                    if not self._line_labels in (('*',), ()):
+                    if self._line_labels not in (('*',), ()):
                         new_hashable_tup = self._labels[layers] + ('@',) + self._line_labels
                     else:
                         new_hashable_tup = self._labels[layers]
@@ -1450,15 +1471,20 @@ def insert_idling_layers_inplace(self, insert_before, num_to_insert, lines=None)
         None
         """
         assert(not self._static), "Cannot edit a read-only circuit!"
-        if insert_before is None: insert_before = len(self._labels)
-        elif insert_before < 0: insert_before = len(self._labels) + insert_before
-
+        if insert_before is None:
+            insert_before = len(self._labels)
+        elif insert_before < 0:
+            insert_before = len(self._labels) + insert_before
+        
         if lines is None:  # insert complete layers
             for i in range(num_to_insert):
                 self._labels.insert(insert_before, [])
 
             #Shift compilable layer indices as needed
             if self._compilable_layer_indices_tup:
+                if num_to_insert <= 0 and insert_before < len(self._labels):
+                    raise ValueError('Undefined behavior (at least until the ' \
+                    'documentation is updated).')
                 shifted_inds = [i if (i < insert_before) else (i + num_to_insert)
                                 for i in self._compilable_layer_indices_tup]
                 self._compilable_layer_indices_tup = tuple(shifted_inds)
@@ -2235,7 +2261,7 @@ def insert_layer_inplace(self, circuit_layer, j):
 
         self.insert_labels_into_layers_inplace([circuit_layer], j)
 
-    def insert_circuit(self, circuit, j):
+    def insert_circuit(self, circuit: Circuit, j):
         """
         Inserts a circuit into this circuit, returning a copy.
 
@@ -2263,7 +2289,7 @@ def insert_circuit(self, circuit, j):
         if self._static: cpy.done_editing()
         return cpy
 
-    def insert_circuit_inplace(self, circuit, j):
+    def insert_circuit_inplace(self, circuit: Circuit, j):
         """
         Inserts a circuit into this circuit.
 
@@ -2298,7 +2324,7 @@ def insert_circuit_inplace(self, circuit, j):
         labels_to_insert = circuit.extract_labels(layers=None, lines=lines_to_insert)
         self.insert_labels_into_layers_inplace(labels_to_insert, j)
 
-    def append_circuit(self, circuit):
+    def append_circuit(self, circuit: Circuit):
         """
         Append a circuit to the end of this circuit, returning a copy.
 
@@ -2316,7 +2342,7 @@ def append_circuit(self, circuit):
         """
         return self.insert_circuit(circuit, self.num_layers)
 
-    def append_circuit_inplace(self, circuit):
+    def append_circuit_inplace(self, circuit: Circuit):
         """
         Append a circuit to the end of this circuit.
 
@@ -2335,7 +2361,7 @@ def append_circuit_inplace(self, circuit):
         assert(not self._static), "Cannot edit a read-only circuit!"
         self.insert_circuit_inplace(circuit, self.num_layers)
 
-    def prefix_circuit(self, circuit):
+    def prefix_circuit(self, circuit: Circuit):
         """
         Prefix a circuit to the beginning of this circuit, returning a copy.
 
@@ -2353,7 +2379,7 @@ def prefix_circuit(self, circuit):
         """
         return self.insert_circuit(circuit, 0)
 
-    def prefix_circuit_inplace(self, circuit):
+    def prefix_circuit_inplace(self, circuit: Circuit):
         """
         Prefix a circuit to the beginning of this circuit.
 
@@ -2372,7 +2398,7 @@ def prefix_circuit_inplace(self, circuit):
         assert(not self._static), "Cannot edit a read-only circuit!"
         self.insert_circuit_inplace(circuit, 0)
 
-    def tensor_circuit_inplace(self, circuit, line_order=None):
+    def tensor_circuit_inplace(self, circuit: Circuit, line_order=None):
         """
         The tensor product of this circuit and `circuit`.
 
@@ -2425,8 +2451,9 @@ def tensor_circuit_inplace(self, circuit, line_order=None):
         #Add circuit's labels into this circuit
         self.insert_labels_as_lines_inplace(circuit._labels, line_labels=circuit.line_labels)
         self._line_labels = new_line_labels  # essentially just reorders labels if needed
+        self.saved_auxinfo["lanes"].update(circuit.saved_auxinfo["lanes"])
 
-    def tensor_circuit(self, circuit, line_order=None):
+    def tensor_circuit(self, circuit: Circuit, line_order=None):
         """
         The tensor product of this circuit and `circuit`, returning a copy.
 
@@ -2453,8 +2480,34 @@ def tensor_circuit(self, circuit, line_order=None):
         cpy.tensor_circuit_inplace(circuit, line_order)
         if self._static: cpy.done_editing()
         return cpy
+    
+    def _cache_tensor_lanes(self, sub_circuit_list: list[_Label],
+                            lane_to_qubits: dict[int, Tuple[int, ...]]) -> Circuit:
+        """
+        Store the tensor lanes in the circuit if appropriate. 
+        Note that this should only be called in the case that the sub_circuit_list
+        when tensored is equivalent to the current circuit.
+        """
+
+        if "lanes" in self.saved_auxinfo and len(self) > 0:
+            if len(self.saved_auxinfo["lanes"]) == 1:
+                # We will update this because it is now believed that
+                # we are able to conduct the operation in cross talk free lanes.
+                qubits_used = set()
+                for qub_in_lane in lane_to_qubits.values():
+                    qubits_used = qubits_used.union(qub_in_lane)
+
+                if len(qubits_used) != self.num_lines:
+                    # Do not update.
+                    return self
+                
+                self.saved_auxinfo["lanes"] = {} # Reset lanes info
+                for i, qubit_labels in lane_to_qubits.items():
+                    self.saved_auxinfo["lanes"][tuple(sorted(qubit_labels))] = sub_circuit_list[i]
+                    
+        return self
 
-    def replace_layer_with_circuit_inplace(self, circuit, j):
+    def replace_layer_with_circuit_inplace(self, circuit: Circuit, j):
         """
         Replaces the `j`-th layer of this circuit with `circuit`.
 
@@ -2474,7 +2527,7 @@ def replace_layer_with_circuit_inplace(self, circuit, j):
         del self[j]
         self.insert_labels_into_layers_inplace(circuit, j)
 
-    def replace_layer_with_circuit(self, circuit, j):
+    def replace_layer_with_circuit(self, circuit: Circuit, j):
         """
         Replaces the `j`-th layer of this circuit with `circuit`,
         returning a copy.
@@ -2521,9 +2574,15 @@ def replace_gatename_inplace(self, old_gatename, new_gatename):
 
         def replace(obj):  # obj is either a simple label or a list
             if isinstance(obj, _Label):
-                if obj.name == old_gatename:
-                    newobj = _Label(new_gatename, obj.sslbls)
-                else: newobj = obj
+                newobj = _Label(new_gatename, obj.sslbls) if (obj.name == old_gatename) else obj
+            elif obj == old_gatename:
+                if len(obj) == 0:
+                    sslbls = self.line_labels
+                else:
+                    import warnings
+                    warnings.warn(f'Cannot infer target of gate(s) of {obj}.')
+                    sslbls = tuple()
+                newobj = _Label((new_gatename,) + sslbls)
             else:
                 newobj = [replace(sub) for sub in obj]
             return newobj
@@ -2588,7 +2647,6 @@ def replace(obj):  # obj is either a simple label or a list
                 else:
                     ret.append(replace(sub))
             return ret
-
         self._labels = replace(self._labels)
 
     def replace_gatename_with_idle(self, gatename):
@@ -3342,7 +3400,7 @@ def layer_with_idles(self, j, idle_gate_name='I'):
         """
         return tuple(self.layer_label_with_idles(j, idle_gate_name).components)
 
-    def layer_label_with_idles(self, j, idle_gate_name='I'):
+    def layer_label_with_idles(self, j, idle_gate_name : str|_Label ='I'):
         """
         Returns the layer, as a :class:`Label`, at depth j, with `idle_gate_name` at empty circuit locations.
 
@@ -4000,7 +4058,7 @@ def convert_to_cirq(self,
         return cirq.Circuit(moments)
     
     @classmethod
-    def from_cirq(cls, circuit, qubit_conversion=None, cirq_gate_conversion= None,
+    def from_cirq(cls, circuit: Circuit, qubit_conversion=None, cirq_gate_conversion= None,
                   remove_implied_idles = True, global_idle_replacement_label = 'auto'):
         """
         Converts and instantiates a pyGSTi Circuit object from a Cirq Circuit object.
@@ -4637,6 +4695,8 @@ def done_editing(self):
                                   else _Label(layer_lbl) for layer_lbl in self._labels])
         self._hashable_tup = self.tup
         self._hash = hash(self._hashable_tup)
+        self._str = None
+        self._str = self.str # this accessor recomputes the value of self._str
 
 class CompressedCircuit(object):
     """
@@ -4667,7 +4727,7 @@ class CompressedCircuit(object):
         takes more time but could result in better compressing.
     """
 
-    def __init__(self, circuit, min_len_to_compress=20, max_period_to_look_for=20):
+    def __init__(self, circuit: Circuit, min_len_to_compress=20, max_period_to_look_for=20):
         """
         Create a new CompressedCircuit object
 
diff --git a/pygsti/circuits/circuitlist.py b/pygsti/circuits/circuitlist.py
index 4d11d385d..09a5c5ed0 100644
--- a/pygsti/circuits/circuitlist.py
+++ b/pygsti/circuits/circuitlist.py
@@ -201,6 +201,14 @@ def __setstate__(self, state_dict):
         self.__dict__.update(state_dict)
         if 'uuid' not in state_dict:  # backward compatibility
             self.uuid = _uuid.uuid4()  # create a new uuid
+    
+    def tensor_circuits(self, other_circuitlist, new_name=None):
+        assert len(self) == len(other_circuitlist)
+        circuits = []
+        for c1,c2 in zip(self._circuits, other_circuitlist._circuits):
+            circuits.append(c1.tensor_circuit(c2))
+        out = CircuitList(circuits, name=new_name)
+        return out
 
     def elementvec_to_array(self, elementvec, layout, mergeop="sum"):
         """
diff --git a/pygsti/circuits/split_circuits_into_lanes.py b/pygsti/circuits/split_circuits_into_lanes.py
new file mode 100644
index 000000000..7650ca0ca
--- /dev/null
+++ b/pygsti/circuits/split_circuits_into_lanes.py
@@ -0,0 +1,186 @@
+import numpy as _np
+
+from typing import Sequence, Dict, Tuple, Optional, Set
+from pygsti.circuits import Circuit as Circuit
+from pygsti.baseobjs.label import Label, LabelTupTup
+
+
+def compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit(circuit: Circuit) -> tuple[dict[int, int],
+                                                                                        dict[int, tuple[int]]]:
+    """
+    Parameters:
+    ------------
+    circuit: Circuit - the circuit to compute qubit to lanes mapping for
+
+    num_qubits: int - The total number of qubits expected in the circuit.
+
+    Returns
+    --------
+    Dictionary mapping qubit number to lane number in the circuit.
+    """
+
+    qubits_to_potentially_entangled_others = {i: set((i,)) for i in range(circuit.num_lines)}
+    num_layers = circuit.num_layers
+    for layer_ind in range(num_layers):
+        layer = circuit.layer(layer_ind)
+        for op in layer:
+            qubits_used = op.qubits
+            for qb in qubits_used:
+                qubits_to_potentially_entangled_others[qb].update(set(qubits_used))
+
+    lanes = {}
+    lan_num = 0
+    visited: dict[int, int] = {}
+    def reachable_nodes(starting_point: int,
+                        graph_qubits_to_neighbors: dict[int, set[int]],
+                        visited: dict[int, set[int]]):
+        """
+        Find which nodes are reachable from this starting point.
+        """
+        if starting_point in visited:
+            return visited[starting_point]
+        else:
+            assert starting_point in graph_qubits_to_neighbors
+            visited[starting_point] = graph_qubits_to_neighbors[starting_point]
+            output = set(visited[starting_point])
+            for child in graph_qubits_to_neighbors[starting_point]:
+                if child != starting_point:
+                    output.update(output, reachable_nodes(child, graph_qubits_to_neighbors, visited))
+            visited[starting_point] = output
+            return output
+
+    available_starting_points = list(sorted(qubits_to_potentially_entangled_others.keys()))
+    while available_starting_points:
+        sp = available_starting_points[0]
+        nodes = reachable_nodes(sp, qubits_to_potentially_entangled_others, visited)
+        for node in nodes:
+            available_starting_points.remove(node)
+        lanes[lan_num] = nodes
+        lan_num += 1
+
+    def compute_qubits_to_lanes(lanes_to_qubits: dict[int, set[int]]) -> dict[int, int]:
+        """
+        Determine a mapping from qubit to the lane it is in for this specific circuit.
+        """
+        out = {}
+        for key, val in lanes_to_qubits.items():
+            for qb in val:
+                out[qb] = key
+        return out
+
+    return compute_qubits_to_lanes(lanes), lanes
+
+
+def compute_subcircuits(circuit: Circuit,
+                        qubit_to_lanes: dict[int, int],
+                        lane_to_qubits: dict[int, tuple[int, ...]],
+                        cache_lanes_in_circuit: bool = False) -> list[list[LabelTupTup]]:
+    """
+    Split a circuit into multiple subcircuits which do not talk across lanes.
+    """
+
+    if "lanes" in circuit.saved_auxinfo:
+        # Check if the lane info matches and I can just return that set up.
+        if len(lane_to_qubits) == len(circuit.saved_auxinfo["lanes"]):
+            # We may have this already in cache.
+
+            lanes_to_gates = [[] for _ in range(len(lane_to_qubits))]
+            for i, key in lane_to_qubits.items():
+                if tuple(sorted(key)) in circuit.saved_auxinfo["lanes"]:
+                    lanes_to_gates[i] = circuit.saved_auxinfo["lanes"][tuple(sorted(key))]
+
+                else:
+                    raise ValueError(f"lbl cache miss: {key} in circuit {circuit}")
+            return lanes_to_gates
+
+    lanes_to_gates = [[] for _ in range(_np.unique(list(qubit_to_lanes.values())).shape[0])]
+
+    num_layers = circuit.num_layers
+    for layer_ind in range(num_layers):
+        layer = circuit.layer_with_idles(layer_ind)
+        group = []
+        group_lane = None
+        sorted_layer = sorted(layer, key=lambda x: x.qubits[0])
+
+        for op in sorted_layer:
+            # We need this to be sorted by the qubit number so we do not get that a lane was split Q1 Q3 Q2 in the layer where Q1 and Q2 are in the same lane.
+            qubits_used = op.qubits # This will be a list of qubits used.
+            # I am assuming that the qubits are indexed numerically and not by strings.
+            lane = qubit_to_lanes[qubits_used[0]]
+
+            if group_lane is None:
+                group_lane = lane
+                group.append(op)
+            elif group_lane == lane:
+                group.append(op)
+            else:
+                lanes_to_gates[group_lane].append(LabelTupTup(tuple(group)))
+                group_lane = lane
+                group = [op]
+
+        if len(group) > 0:
+            # We have a left over group.
+            lanes_to_gates[group_lane].append(LabelTupTup(tuple(group)))
+
+    if cache_lanes_in_circuit:
+        circuit = circuit._cache_tensor_lanes(lanes_to_gates, lane_to_qubits)
+
+    if num_layers == 0:
+        return lanes_to_gates
+
+    return lanes_to_gates
+
+
+@staticmethod
+def batch_tensor(
+    circuits : Sequence[Circuit],
+    layer_mappers: Dict[int, Dict],
+    global_line_order: Optional[Tuple[int,...]] = None,
+    target_lines : Optional[Sequence[Tuple[int,...]]] = None
+    ) -> Circuit:
+    """
+    """
+    assert len(circuits) > 0
+
+    if target_lines is None:
+        target_lines = []
+        total_lines = 0
+        max_cir_len = 0
+        for c in circuits:
+            target_lines.append(tuple(range(total_lines, total_lines + c.num_lines)))
+            total_lines += c.num_lines
+            max_cir_len = max(max_cir_len, len(c))
+    else:
+        total_lines = sum([c.num_lines for c in circuits])
+        max_cir_len = max(*[len(c) for c in circuits])
+
+    s : Set[int] = set()
+    for c, t in zip(circuits, target_lines):
+        assert not s.intersection(t)
+        assert len(t) == c.num_lines
+        s.update(t)
+
+    if global_line_order is None:
+        global_line_order = tuple(sorted(list(s)))
+    
+    c = circuits[0].copy(editable=True)
+    c._append_idling_layers_inplace(max_cir_len - len(c))
+    c.done_editing()
+    # ^ That changes the format of c._labels. We need to edit c while in this format,
+    #   so the next line sets c._static = False. (We repeat this pattern in the loop below.)
+    c._static = False
+    c._labels = [layer_mappers[c.num_lines][ell] for ell in c._labels]
+    c.map_state_space_labels_inplace({k:v for k,v in zip(c.line_labels, target_lines[0])})
+    c.done_editing()
+    for i, c2 in enumerate(circuits[1:]):
+        c2 = c2.copy(editable=True)
+        c2._append_idling_layers_inplace(max_cir_len - len(c2))
+        c2.done_editing()
+        c2._static = False
+        c2._labels = [layer_mappers[c2.num_lines][ell] for ell in c2._labels]
+        c2.map_state_space_labels_inplace({k:v for k,v in zip(c2.line_labels, target_lines[i+1])})
+        c2.done_editing()
+        c = c.tensor_circuit(c2)
+
+    c = c.reorder_lines(global_line_order)
+    return c
diff --git a/pygsti/forwardsims/forwardsim.py b/pygsti/forwardsims/forwardsim.py
index 85d43899e..5af17fb9a 100644
--- a/pygsti/forwardsims/forwardsim.py
+++ b/pygsti/forwardsims/forwardsim.py
@@ -21,7 +21,10 @@
 from pygsti.baseobjs.resourceallocation import ResourceAllocation as _ResourceAllocation
 from pygsti.baseobjs.nicelyserializable import NicelySerializable as _NicelySerializable
 from pygsti.tools import slicetools as _slct
-from typing import Union, Callable, Literal
+from typing import Union, Callable, Literal, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from pygsti.models.model import OpModel
 
 
 class ForwardSimulator(_NicelySerializable):
@@ -96,7 +99,7 @@ def _array_types_for_method(cls, method_name):
             return ('ep', 'ep') + cls._array_types_for_method('_bulk_fill_dprobs_block')
         return ()
 
-    def __init__(self, model=None):
+    def __init__(self, model: OpModel=None):
         super().__init__()
         #self.dim = model.dim
         self.model = model
@@ -128,11 +131,11 @@ def __getstate__(self):
         return state_dict
 
     @property
-    def model(self):
+    def model(self) -> OpModel:
         return self._model
 
     @model.setter
-    def model(self, val):
+    def model(self, val: OpModel):
         self._model = val
         try:
             evotype = None if val is None else self._model.evotype
diff --git a/pygsti/forwardsims/mapforwardsim.py b/pygsti/forwardsims/mapforwardsim.py
index 3252ade38..d650573c0 100644
--- a/pygsti/forwardsims/mapforwardsim.py
+++ b/pygsti/forwardsims/mapforwardsim.py
@@ -15,6 +15,7 @@
 
 import numpy as _np
 from numpy import linalg as _nla
+import time
 
 from pygsti.forwardsims.distforwardsim import DistributableForwardSimulator as _DistributableForwardSimulator
 from pygsti.forwardsims.forwardsim import ForwardSimulator as _ForwardSimulator
@@ -360,13 +361,16 @@ def create_copa_layout_circuit_cache(circuits, model, dataset=None):
 
     def _bulk_fill_probs_atom(self, array_to_fill, layout_atom, resource_alloc):
         # Note: *don't* set dest_indices arg = layout.element_slice, as this is already done by caller
-        resource_alloc.check_can_allocate_memory(layout_atom.cache_size * self.model.dim)
+        # resource_alloc.check_can_allocate_memory(layout_atom.cache_size * self.model.dim)
+        start_time = time.time()
         self.calclib.mapfill_probs_atom(self, array_to_fill, slice(0, array_to_fill.shape[0]),  # all indices
                                         layout_atom, resource_alloc)
+        end_time = time.time()
+        print("Time to compute forward probs with map Forward after fixed layout (s): ", end_time - start_time)
 
     def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom, param_slice, resource_alloc):
         # Note: *don't* set dest_indices arg = layout.element_slice, as this is already done by caller
-        resource_alloc.check_can_allocate_memory(layout_atom.cache_size * self.model.dim * _slct.length(param_slice))
+        # resource_alloc.check_can_allocate_memory(layout_atom.cache_size * self.model.dim * _slct.length(param_slice))
         self.calclib.mapfill_dprobs_atom(self, array_to_fill, slice(0, array_to_fill.shape[0]), dest_param_slice,
                                          layout_atom, param_slice, resource_alloc, self.derivative_eps)
 
diff --git a/pygsti/forwardsims/matrixforwardsim.py b/pygsti/forwardsims/matrixforwardsim.py
index 75b745305..88b7491e7 100644
--- a/pygsti/forwardsims/matrixforwardsim.py
+++ b/pygsti/forwardsims/matrixforwardsim.py
@@ -10,10 +10,12 @@
 # http://www.apache.org/licenses/LICENSE-2.0 or in the LICENSE file in the root pyGSTi directory.
 #***************************************************************************************************
 
+import time
 import collections as _collections
 import time as _time
 import warnings as _warnings
 
+from tqdm import tqdm
 import numpy as _np
 import numpy.linalg as _nla
 
@@ -21,7 +23,10 @@
 from pygsti.forwardsims.forwardsim import ForwardSimulator as _ForwardSimulator
 from pygsti.forwardsims.forwardsim import _bytes_for_array_types
 from pygsti.layouts.evaltree import EvalTree as _EvalTree
+from pygsti.layouts.evaltree import EvalTreeBasedUponLongestCommonSubstring as _EvalTreeLCS
+from pygsti.layouts.evaltree import setup_circuit_list_for_LCS_computations, CollectionOfLCSEvalTrees
 from pygsti.layouts.matrixlayout import MatrixCOPALayout as _MatrixCOPALayout
+from pygsti.layouts.matrixlayout import _MatrixCOPALayoutAtomWithLCS
 from pygsti.baseobjs.profiler import DummyProfiler as _DummyProfiler
 from pygsti.baseobjs.resourceallocation import ResourceAllocation as _ResourceAllocation
 from pygsti.baseobjs.verbosityprinter import VerbosityPrinter as _VerbosityPrinter
@@ -31,7 +36,13 @@
 from pygsti.tools.matrixtools import _fas
 from pygsti import SpaceT
 from pygsti.tools import listtools as _lt
-from pygsti.circuits import CircuitList as _CircuitList
+from pygsti.circuits import CircuitList as _CircuitList, Circuit as _Circuit
+from pygsti.tools.internalgates import internal_gate_unitaries
+from pygsti.tools.optools import unitary_to_superop
+from pygsti.baseobjs.label import LabelTup, LabelTupTup, Label
+
+from typing import Sequence, Optional
+
 
 _dummy_profiler = _DummyProfiler()
 
@@ -696,8 +707,9 @@ def _array_types_for_method(cls, method_name):
         return super()._array_types_for_method(method_name)
 
     def __init__(self, model=None, distribute_by_timestamp=False, num_atoms=None, processor_grid=None,
-                 param_blk_sizes=None):
+                 param_blk_sizes=None, cache_doperations=True):
         super().__init__(model, num_atoms, processor_grid, param_blk_sizes)
+        self._cache_dops = cache_doperations
         self._mode = "distribute_by_timestamp" if distribute_by_timestamp else "time_independent"
 
     def _to_nice_serialization(self):
@@ -739,6 +751,9 @@ def _compute_product_cache(self, layout_atom_tree, resource_alloc):
         eval_tree = layout_atom_tree
         cacheSize = len(eval_tree)
         prodCache = _np.zeros((cacheSize, dim, dim), 'd')
+        # ^ This assumes assignments prodCache[i] = <2d numpy array>.
+        #   It would be better for this to be a dict (mapping _most likely_
+        #   to ndarrays) if we don't need slicing or other axis indexing.
         scaleCache = _np.zeros(cacheSize, 'd')
 
         for iDest, iRight, iLeft in eval_tree:
@@ -752,7 +767,10 @@ def _compute_product_cache(self, layout_atom_tree, resource_alloc):
                 else:
                     gate = self.model.circuit_layer_operator(opLabel, 'op').to_dense("minimal")
                     nG = max(_nla.norm(gate), 1.0)
+                    # ^ This indicates a need to compute norms of the operation matrices. Can't do this
+                    #   with scipy.linalg if gate is represented implicitly. 
                     prodCache[iDest] = gate / nG
+                    # ^ Indicates a need to overload division by scalars.
                     scaleCache[iDest] = _np.log(nG)
                 continue
 
@@ -765,10 +783,12 @@ def _compute_product_cache(self, layout_atom_tree, resource_alloc):
             scaleCache[iDest] = scaleCache[iLeft] + scaleCache[iRight]
 
             if prodCache[iDest].max() < _PSMALL and prodCache[iDest].min() > -_PSMALL:
-                nL, nR = max(_nla.norm(L), _np.exp(-scaleCache[iLeft]),
-                             1e-300), max(_nla.norm(R), _np.exp(-scaleCache[iRight]), 1e-300)
+                nL = max(_nla.norm(L), _np.exp(-scaleCache[iLeft]), 1e-300)
+                nR = max(_nla.norm(R), _np.exp(-scaleCache[iRight]), 1e-300)
                 sL, sR = L / nL, R / nR
-                prodCache[iDest] = _np.dot(sL, sR); scaleCache[iDest] += _np.log(nL) + _np.log(nR)
+                # ^ Again, shows the need to overload division by scalars.
+                prodCache[iDest] = sL @ sR
+                scaleCache[iDest] += _np.log(nL) + _np.log(nR)
 
         nanOrInfCacheIndices = (~_np.isfinite(prodCache)).nonzero()[0]  # may be duplicates (a list, not a set)
         # since all scaled gates start with norm <= 1, products should all have norm <= 1
@@ -839,6 +859,8 @@ def _compute_dproduct_cache(self, layout_atom_tree, prod_cache, scale_cache,
 
         tSerialStart = _time.time()
         dProdCache = _np.zeros((cacheSize,) + deriv_shape)
+        # ^ I think that deriv_shape will be a tuple of length > 2.
+        #   (Based on how swapaxes is used in the loop below ...)
         wrtIndices = _slct.indices(wrt_slice) if (wrt_slice is not None) else None
 
         for iDest, iRight, iLeft in eval_tree:
@@ -852,6 +874,9 @@ def _compute_dproduct_cache(self, layout_atom_tree, prod_cache, scale_cache,
                     #doperation = self.dproduct( (opLabel,) , wrt_filter=wrtIndices)
                     doperation = self._doperation(opLabel, wrt_filter=wrtIndices)
                     dProdCache[iDest] = doperation / _np.exp(scale_cache[iDest])
+                    # ^ Need a way to track tensor product structure in whatever's 
+                    #   being returned by self._doperation (presumably it's a tensor ...)
+
                 continue
 
             tm = _time.time()
@@ -862,8 +887,18 @@ def _compute_dproduct_cache(self, layout_atom_tree, prod_cache, scale_cache,
             # since then matrixOf(circuit[i]) = matrixOf(circuit[iLeft]) * matrixOf(circuit[iRight])
             L, R = prod_cache[iLeft], prod_cache[iRight]
             dL, dR = dProdCache[iLeft], dProdCache[iRight]
-            dProdCache[iDest] = _np.dot(dL, R) + \
-                _np.swapaxes(_np.dot(L, dR), 0, 1)  # dot(dS, T) + dot(S, dT)
+            term1 = _np.dot(dL, R)
+            term2 = _np.swapaxes(_np.dot(L, dR), 0, 1) 
+            # ^ From the numpy docs on .dot :
+            #
+            #   If a is an N-D array and b is an M-D array (where M>=2),
+            #   it is a sum product over the last axis of a and the second-to-last axis of b:
+            #
+            #       dot(a, b)[i,j,k,m] = sum(a[i,j,:] * b[k,:,m])
+            #
+            dProdCache[iDest] = term1 +  term2  # dot(dS, T) + dot(S, dT)
+            # ^ We need addition of tensor-product-structured "doperators."
+
             profiler.add_time("compute_dproduct_cache: dots", tm)
             profiler.add_count("compute_dproduct_cache: dots")
 
@@ -871,9 +906,12 @@ def _compute_dproduct_cache(self, layout_atom_tree, prod_cache, scale_cache,
             if abs(scale) > 1e-8:  # _np.isclose(scale,0) is SLOW!
                 dProdCache[iDest] /= _np.exp(scale)
                 if dProdCache[iDest].max() < _DSMALL and dProdCache[iDest].min() > -_DSMALL:
+                    # ^ Need the tensor-product-structured "doperators" to have .max() and .min()
+                    #   methods.
                     _warnings.warn("Scaled dProd small in order to keep prod managable.")
             elif (_np.count_nonzero(dProdCache[iDest]) and dProdCache[iDest].max() < _DSMALL
                   and dProdCache[iDest].min() > -_DSMALL):
+                # ^ Need to bypass the call to _np.count_nonzero(...).
                 _warnings.warn("Would have scaled dProd but now will not alter scale_cache.")
 
         #profiler.print_mem("DEBUGMEM: POINT2"); profiler.comm.barrier()
@@ -1034,7 +1072,7 @@ def _compute_hproduct_cache(self, layout_atom_tree, prod_cache, d_prod_cache1,
         return hProdCache
 
     def create_layout(self, circuits, dataset=None, resource_alloc=None, array_types=('E',),
-                      derivative_dimensions=None, verbosity=0, layout_creation_circuit_cache= None):
+                      derivative_dimensions=None, verbosity=0, layout_creation_circuit_cache= None, use_old_tree_style: bool = True):
         """
         Constructs an circuit-outcome-probability-array (COPA) layout for a list of circuits.
 
@@ -1121,7 +1159,7 @@ def create_layout(self, circuits, dataset=None, resource_alloc=None, array_types
 
         layout = _MatrixCOPALayout(circuits, self.model, dataset, natoms,
                                    na, npp, param_dimensions, param_blk_sizes, resource_alloc, verbosity, 
-                                   layout_creation_circuit_cache=layout_creation_circuit_cache)
+                                   layout_creation_circuit_cache=layout_creation_circuit_cache, use_old_tree_style=use_old_tree_style)
 
         if mem_limit is not None:
             loc_nparams1 = num_params / npp[0] if len(npp) > 0 else 0
@@ -1220,6 +1258,7 @@ def _probs_from_rho_e(self, rho, e, gs, scale_vals):
         #  vp[i] = sum_k e[0,k] dot(gs, rho)[i,k,0]  * scale_vals[i]
         #  vp[i] = dot( e, dot(gs, rho))[0,i,0]      * scale_vals[i]
         #  vp    = squeeze( dot( e, dot(gs, rho)), axis=(0,2) ) * scale_vals
+        # breakpoint()
         return _np.squeeze(_np.dot(e, _np.dot(gs, rho)), axis=(0, 2)) * scale_vals
         # shape == (len(circuit_list),) ; may overflow but OK
 
@@ -1491,6 +1530,32 @@ def _bulk_fill_probs_atom(self, array_to_fill, layout_atom, resource_alloc):
         _np.seterr(**old_err)
 
     def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom, param_slice, resource_alloc):
+        if not self._cache_dops:
+            # This call errors because it tries to compute layout_atom.as_layout(resource_alloc),
+            # which isn't implemented. Looking at how layout_atom is used in the other branch
+            # of this if-statement it isn't clear how to work around this. Can look at 
+            # MapForwardSimulator._bulk_fill_dprobs_atom(...).
+            # 
+            # Verbatim contents:
+            #
+            #       resource_alloc.check_can_allocate_memory(layout_atom.cache_size * self.model.dim * _slct.length(param_slice))
+            #       self.calclib.mapfill_dprobs_atom(self, array_to_fill, slice(0, array_to_fill.shape[0]), dest_param_slice,
+            #                                  layout_atom, param_slice, resource_alloc, self.derivative_eps)
+            #
+            # where
+            #
+            #       self.calclib = _importlib.import_module("pygsti.forwardsims.mapforwardsim_calc_" + evotype.name).
+            # 
+            # and an implementation can be found at
+            #
+            #       /Users/rjmurr/Documents/pg-xfgst/repo/pygsti/forwardsims/mapforwardsim_calc_generic.py.
+            #
+            # Specifically, in mapfill_probs_atom. But that doesn't do anything like layout_atom.as_layout(resource_alloc) .... :(
+            # 
+
+            _DistributableForwardSimulator._bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom, param_slice, resource_alloc)
+            return
+        
         dim = self.model.evotype.minimal_dim(self.model.state_space)
         resource_alloc.check_can_allocate_memory(layout_atom.cache_size * dim * dim * _slct.length(param_slice))
         prodCache, scaleCache = self._compute_product_cache(layout_atom.tree, resource_alloc)
@@ -2104,3 +2169,407 @@ def bulk_fill_timedep_dloglpp(self, array_to_fill, layout, ds_circuits, num_tota
                                                   layout.resource_alloc())
         return self._bulk_fill_timedep_dobjfn(raw_obj, array_to_fill, layout, ds_circuits, num_total_outcomes,
                                               dataset, ds_cache)
+
+
+class LCSEvalTreeMatrixForwardSimulator(MatrixForwardSimulator):
+
+    def bulk_product(self, circuits, scale=False, resource_alloc=None):
+        """
+        Compute the products of many circuits at once.
+
+        Parameters
+        ----------
+        circuits : list of Circuits
+            The circuits to compute products for.  These should *not* have any preparation or
+            measurement layers.
+
+        scale : bool, optional
+            When True, return a scaling factor (see below).
+
+        resource_alloc : ResourceAllocation
+            Available resources for this computation. Includes the number of processors
+            (MPI comm) and memory limit.
+
+        Returns
+        -------
+        prods : numpy array
+            Array of shape S x G x G, where:
+            - S == the number of operation sequences
+            - G == the linear dimension of a operation matrix (G x G operation matrices).
+        scaleValues : numpy array
+            Only returned when scale == True. A length-S array specifying
+            the scaling that needs to be applied to the resulting products
+            (final_product[i] = scaleValues[i] * prods[i]).
+        """
+        resource_alloc = _ResourceAllocation.cast(resource_alloc)
+
+        my_data = setup_circuit_list_for_LCS_computations(circuits, None)
+
+        full_tree = CollectionOfLCSEvalTrees(my_data[2], my_data[1], my_data[0])
+
+        full_tree.collapse_circuits_to_process_matrices(self.model, None)
+        Gs = full_tree.reconstruct_full_matrices()
+
+        return Gs
+
+    def _bulk_fill_probs_atom(self, array_to_fill, layout_atom: _MatrixCOPALayoutAtomWithLCS, resource_alloc):
+        
+        # Overestimate the amount of cache usage by assuming everything is the same size.
+        dim = self.model.evotype.minimal_dim(self.model.state_space)
+        # resource_alloc.check_can_allocate_memory(len(layout_atom.tree.cache) * dim**2)  # prod cache
+
+        layout_atom.tree.collapse_circuits_to_process_matrices(self.model, None)
+
+        Gs, all_circuits = layout_atom.tree.reconstruct_full_matrices()
+
+        sp_val, povm_vals, element_indices, tree_indices = self._rho_es_elm_inds_and_tree_inds_from_spam_tuples(layout_atom)
+        old_err = _np.seterr(over='ignore')
+        # for spam_tuple, (element_indices, tree_indices) in layout_atom.indices_by_spamtuple.items():
+        #     # "element indices" index a circuit outcome probability in array_to_fill's first dimension
+        #     # "tree indices" index a quantity for a no-spam circuit in a computed cache, which correspond
+        #     #  to the the element indices when `spamtuple` is used.
+        #     # (Note: *don't* set dest_indices arg = layout.element_slice, as this is already done by caller)
+        #     rho, E = self._rho_e_from_spam_tuple(spam_tuple)
+        #     _fas(array_to_fill, [element_indices],
+        #          self._probs_from_rho_e(rho, E, Gs[tree_indices], 1))
+        probs = self._probs_from_rho_e(sp_val, povm_vals, Gs[tree_indices], 1)
+        # if not isinstance(element_indices, list):
+        #     element_indices = [element_indices]
+        # collapse element list
+        
+        # _fas(array_to_fill, element_indices, probs)
+        array_to_fill[:] = probs
+        _np.seterr(**old_err)
+
+    def _probs_from_rho_e(self, rho, e: _np.ndarray, gs, scale_vals = 1, return_two_D: bool = False):
+        """
+        Compute the probabilities from rho, a set of povms, the circuits defined by gs, and then scale appropriately.
+        """
+
+        assert e.ndim == 2
+        ## WHY ??? assert e[0] > 1
+        out = _np.zeros((len(e), len(gs)))
+        for i in range(len(gs)):
+            out[:, i] = _np.squeeze(e @ (gs[i] @ rho), axis=(1))
+
+        if not return_two_D:
+            out = out.reshape((len(gs)*len(e)))
+        return out
+        return _np.squeeze(e @ (gs @ rho), axis=(2)) # only one rho.
+
+        return super()._probs_from_rho_e(rho, e, gs, scale_vals)
+
+    def _layout_atom_to_rho_es_elm_inds_and_tree_inds_objs(self,
+                        layout_atom: _MatrixCOPALayoutAtomWithLCS):
+        
+        """
+        Assumes one state prep and many measurements.
+
+        We assume that there will be only one set of tree indices used throughout.
+        Also, we assume that we can find a slice 
+        """
+
+        sp_val = None
+        povm_vals: list[Label] = []
+        elm_inds: list[slice] = [] # This will get collapsed since we are assuming that each povm appears once.
+        tree_inds: list[slice] = [] # I am assuming that this will 
+        for spam_tuple, (element_indices, tree_indices) in layout_atom.indices_by_spamtuple.items():
+            if spam_tuple[0] != sp_val and sp_val is not None:
+                raise ValueError("More than one state prep is being used.")
+            else:
+                sp_val = spam_tuple[0]
+
+            
+            povm_vals.append(spam_tuple[1])
+            elm_inds.append(element_indices)
+            tree_inds.append(tree_indices)
+
+        sp_val = self.model.circuit_layer_operator(sp_val, "prep")
+
+        povm_vals = [self.model.circuit_layer_operator(elabel, 'povm') for elabel in povm_vals]
+
+        tree_inds = _np.unique(tree_inds)[0]
+        return sp_val, povm_vals, elm_inds, tree_inds        
+
+    def _rho_es_elm_inds_and_tree_inds_from_spam_tuples(self,
+                    layout_atom: _MatrixCOPALayoutAtomWithLCS) -> tuple[_np.ndarray, _np.ndarray]:
+        """
+        Assumes one state prep and many measurements.
+
+        We assume that there will be only one set of tree indices used throughout.
+        Also, we assume that we can find a slice 
+        """
+
+        sp_val = None
+        povm_vals: list[Label] = []
+        elm_inds: list[slice] = [] # This will get collapsed since we are assuming that each povm appears once.
+        tree_inds: list[slice] = [] # I am assuming that this will 
+        for spam_tuple, (element_indices, tree_indices) in layout_atom.indices_by_spamtuple.items():
+            if spam_tuple[0] != sp_val and sp_val is not None:
+                raise ValueError("More than one state prep is being used.")
+            else:
+                sp_val = spam_tuple[0]
+
+            
+            povm_vals.append(spam_tuple[1])
+            elm_inds.append(element_indices)
+            tree_inds.append(tree_indices)
+
+        sp_val, povm_vals = self._rho_es_from_spam_tuples(sp_val, povm_vals)
+        povm_vals = _np.vstack(povm_vals)
+        tree_inds = _np.unique(tree_inds)[0]
+        return sp_val, povm_vals, elm_inds, tree_inds
+
+    def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice: Optional[slice],
+                               layout_atom: _MatrixCOPALayoutAtomWithLCS, param_slice: Optional[slice], resource_alloc):
+        eps = 1e-7
+        if param_slice is None:
+            param_slice = slice(0, self.model.num_params)
+        param_indices = _slct.to_array(param_slice)
+
+        if dest_param_slice is None:
+            dest_param_slice = slice(0, len(param_indices))
+        dest_param_indices = _slct.to_array(dest_param_slice)
+
+        iParamToFinal = {i: dest_param_indices[ii] for ii, i in enumerate(param_indices)}
+
+        probs = _np.empty(layout_atom.num_elements, 'd')
+        self._bulk_fill_probs_atom(probs, layout_atom, resource_alloc)
+        orig_vec = self.model.to_vector().copy()
+
+        # CALL_BASIC = False
+        # CALL_BASIC_SET_PARAM_VALS = False
+        # CALL_SPLIT_OFF_SPAM_REDO_ALL_OF_TREE = False
+        # CALL_SPLIT_OFF_SPAM_PARTIAL_CACHE_TREE = True
+
+        # if CALL_BASIC:
+        #     return self._bulk_fill_dprobs_atom_using_from_vector(array_to_fill, layout_atom, param_indices, resource_alloc, iParamToFinal, probs, orig_vec, eps)
+        # elif CALL_BASIC_SET_PARAM_VALS:
+        #     return self._bulk_fill_dprobs_atom_using_set_param_vals_no_cache(array_to_fill, layout_atom, param_indices, resource_alloc, iParamToFinal, probs, orig_vec, eps)
+        # elif CALL_SPLIT_OFF_SPAM_REDO_ALL_OF_TREE:
+        #     return self._bulk_fill_dprobs_atom_with_from_vector_split_SPAM_off(array_to_fill, layout_atom, param_indices, resource_alloc, iParamToFinal, probs, orig_vec, eps)
+        # elif CALL_SPLIT_OFF_SPAM_PARTIAL_CACHE_TREE:
+        return self._bulk_fill_dprobs_atom_using_from_vector_but_cache_collapse(array_to_fill, layout_atom, param_indices, resource_alloc, iParamToFinal, probs, orig_vec, eps)
+
+    def create_layout(self, circuits : Sequence[_Circuit] | _CircuitList, dataset=None, resource_alloc=None, array_types=('E', ), derivative_dimensions=None, verbosity=0, layout_creation_circuit_cache=None):
+        return super().create_layout(circuits, dataset, resource_alloc, array_types, derivative_dimensions, verbosity, layout_creation_circuit_cache, use_old_tree_style=False)
+
+
+
+    def _bulk_fill_dprobs_atom_using_from_vector(self, array_to_fill, layout_atom: _MatrixCOPALayoutAtomWithLCS, param_indices,
+                resource_alloc, iParamToFinal: dict[int, int], base_probs: _np.ndarray, orig_vec: _np.ndarray, eps: float = 1e-7):
+        """
+        Specifically use the from_vector method to update the model before the finite difference scheme.
+        """
+
+        probs2 = _np.empty(layout_atom.num_elements, 'd')
+
+        for i in range(len(param_indices)):
+
+            new_vec = orig_vec.copy()
+            new_vec[param_indices[i]] += eps
+            self.model.from_vector(new_vec)
+
+            self._bulk_fill_probs_atom(probs2, layout_atom, resource_alloc)
+
+            array_to_fill[:, iParamToFinal[param_indices[i]]] = (probs2 - base_probs) / eps
+
+        self.model.from_vector(orig_vec)
+
+    def _bulk_fill_dprobs_atom_using_set_param_vals_no_cache(self, array_to_fill, layout_atom: _MatrixCOPALayoutAtomWithLCS, param_indices,
+                resource_alloc, iParamToFinal: dict[int, int], base_probs: _np.ndarray, orig_vec: _np.ndarray, eps: float = 1e-7):
+        """
+        Specifically use the set_parameter_values method to update the model before the finite difference scheme which will recompute the whole LCS tree.
+        """
+        
+        # THIS METHOD FAILS TO PRODUCE THE CORRECT RESULTS! THIS IS BECAUSE THE SET_PARAMETER_VALUES call still does not work.
+
+        probs2 = _np.empty(layout_atom.num_elements, 'd')
+        
+        if len(param_indices) > 0:
+
+            first_ind = param_indices[0]
+            new_vec = orig_vec.copy()
+            new_vec[first_ind] += eps
+
+            self.model.set_parameter_value(first_ind, orig_vec[first_ind] + eps)
+
+            self._bulk_fill_probs_atom(probs2, layout_atom, resource_alloc)
+
+            array_to_fill[:, iParamToFinal[first_ind]] = (probs2 - base_probs) / eps
+
+        for i in range(1, len(param_indices)):
+            self.model.set_parameter_values([param_indices[i - 1], param_indices[i]],
+                                            [orig_vec[i-1], orig_vec[i] + eps])
+            new_vec = self.model.to_vector()
+            assert new_vec[i] == (orig_vec[i] + eps)
+            assert _np.allclose(new_vec[:i], orig_vec[:i] )
+            assert _np.allclose(new_vec[i+1:], orig_vec[i+1:])
+
+            self._bulk_fill_probs_atom(probs2, layout_atom, resource_alloc)
+
+            array_to_fill[:, iParamToFinal[first_ind]] = (probs2 - base_probs) / eps
+
+        self.model.from_vector(orig_vec)
+
+    def _bulk_fill_dprobs_atom_with_from_vector_split_SPAM_off(self, array_to_fill, layout_atom: _MatrixCOPALayoutAtomWithLCS,
+                                                               param_indices: _np.ndarray, resource_alloc,
+                                                               iParamToFinal: dict[int, int], base_probs: _np.ndarray,
+                                                               orig_vec: _np.ndarray, eps: float = 1e-7,
+                                                               return_sequence_matrices: bool = False, recompute_whole_tree: bool = False):
+
+        original_Gs = layout_atom.tree.reconstruct_full_matrices()
+
+        probs2 = _np.empty(layout_atom.num_elements, 'd')
+        
+        sp_obj, povm_objs, _, tree_indices = self._layout_atom_to_rho_es_elm_inds_and_tree_inds_objs(layout_atom)
+
+        orig_dense_sp = sp_obj.to_dense(on_space="minimal")[:,None] # To maintain expected shape.
+        dense_povms = _np.vstack([_np.conjugate(_np.transpose(povm.to_dense(on_space='minimal')[:, None])) for povm in povm_objs])
+
+        if return_sequence_matrices:
+            output = []
+
+        _, rho_gpindices = self._process_wrt_filter(param_indices, sp_obj)
+        for i in range(len(rho_gpindices)):
+            iFinal = iParamToFinal[rho_gpindices[i]]
+            
+            new_vec = orig_vec.copy()
+            new_vec[rho_gpindices[i]] += eps
+            self.model.from_vector(new_vec)
+
+            dense_sp = sp_obj.to_dense(on_space="minimal")[:, None]
+            probs2 = self._probs_from_rho_e(dense_sp, dense_povms, original_Gs[tree_indices])
+
+            array_to_fill[:, iFinal] = (probs2 - base_probs) / eps
+
+            if return_sequence_matrices:
+                output.append(original_Gs)
+ 
+        _, povm_gpindices = self._process_wrt_filter(param_indices, self.model.circuit_layer_operator(self.model.primitive_povm_labels[0], "povm"))
+        for i in range(len(povm_gpindices)):
+            iFinal = iParamToFinal[povm_gpindices[i]]
+            new_vec = orig_vec.copy()
+            new_vec[povm_gpindices[i]] += eps
+            self.model.from_vector(new_vec)
+
+            # We are only varying the POVMs.
+            effects = _np.vstack([_np.conjugate(_np.transpose(povm.to_dense(on_space='minimal')[:, None])) for povm in povm_objs])
+
+            probs2 = self._probs_from_rho_e(orig_dense_sp, effects, original_Gs[tree_indices])
+            
+            array_to_fill[:, iFinal] = (probs2 - base_probs) / eps
+
+            if return_sequence_matrices:
+                output.append(original_Gs)
+
+        remaining_param_inds = sorted(list(set(param_indices) - set(_slct.to_array(povm_gpindices)) - set(_slct.to_array(rho_gpindices))))
+
+        for i in range(len(remaining_param_inds)):
+            iFinal = iParamToFinal[remaining_param_inds[i]]
+
+            new_vec = orig_vec.copy()
+            new_vec[remaining_param_inds[i]] += eps
+            self.model.from_vector(new_vec)
+
+            layout_atom.tree.collapse_circuits_to_process_matrices(self.model, None)
+            Gs = layout_atom.tree.reconstruct_full_matrices()
+
+            probs2 = self._probs_from_rho_e(orig_dense_sp, dense_povms, Gs[tree_indices])
+
+            array_to_fill[:, iFinal] = (probs2 - base_probs) / eps
+
+            if return_sequence_matrices:
+                output.append(Gs)
+
+        self.model.from_vector(orig_vec) # reset to the original model.
+
+        if return_sequence_matrices:
+            return output
+
+
+    def _bulk_fill_dprobs_atom_using_from_vector_but_cache_collapse(self, array_to_fill, layout_atom: _MatrixCOPALayoutAtomWithLCS,
+                param_indices: _np.ndarray, resource_alloc, iParamToFinal: dict[int, int], base_probs: _np.ndarray,
+                orig_vec: _np.ndarray, eps: float = 1e-7, return_sequence_matrices: bool = False):
+        """
+        Use from_vector to update the model. Then, recompute only the part of the tree which you need to recompute.
+        """
+
+        original_Gs, all_cirs = layout_atom.tree.reconstruct_full_matrices()
+
+        probs2 = _np.empty(layout_atom.num_elements, 'd')
+        
+        sp_obj, povm_objs, elm_indices, tree_indices = self._layout_atom_to_rho_es_elm_inds_and_tree_inds_objs(layout_atom)
+
+        elm_indices = _np.array([_slct.indices(elm) for elm in elm_indices])
+
+        orig_dense_sp = sp_obj.to_dense(on_space="minimal")[:,None] # To maintain expected shape.
+        dense_povms = _np.vstack([_np.conjugate(_np.transpose(povm.to_dense(on_space='minimal')[:, None])) for povm in povm_objs])
+        if return_sequence_matrices:
+            output = []
+
+        _, rho_gpindices = self._process_wrt_filter(param_indices, sp_obj)
+        for i in range(len(rho_gpindices)):
+            probs2[:] = base_probs[:]
+            iFinal = iParamToFinal[rho_gpindices[i]]
+            
+            new_vec = orig_vec.copy()
+            new_vec[rho_gpindices[i]] += eps
+            self.model.from_vector(new_vec)
+
+            dense_sp = sp_obj.to_dense(on_space="minimal")[:, None]
+            probs2 = self._probs_from_rho_e(dense_sp, dense_povms, original_Gs[tree_indices])
+
+            array_to_fill[:, iFinal] = (probs2 - base_probs) / eps
+
+            if return_sequence_matrices:
+                output.append(original_Gs)
+ 
+        _, povm_gpindices = self._process_wrt_filter(param_indices, self.model.circuit_layer_operator(self.model.primitive_povm_labels[0], "povm"))
+        for i in range(len(povm_gpindices)):
+            probs2[:] = base_probs[:]
+            iFinal = iParamToFinal[povm_gpindices[i]]
+            new_vec = orig_vec.copy()
+            new_vec[povm_gpindices[i]] += eps
+            self.model.from_vector(new_vec)
+
+            # We are only varying the POVMs.
+            effects = _np.vstack([_np.conjugate(_np.transpose(povm.to_dense(on_space='minimal')[:, None])) for povm in povm_objs])
+
+            probs2 = self._probs_from_rho_e(orig_dense_sp, effects, original_Gs[tree_indices])
+            
+            array_to_fill[:, iFinal] = (probs2 - base_probs) / eps
+
+            if return_sequence_matrices:
+                output.append(original_Gs)
+
+        remaining_param_inds = sorted(list(set(param_indices) - set(_slct.to_array(povm_gpindices)) - set(_slct.to_array(rho_gpindices))))
+
+        dirty_circuits = layout_atom.tree.determine_which_circuits_will_update_for_what_gpindices(self.model)
+
+        for i in range(len(remaining_param_inds)):
+            probs2[:] = base_probs[:] # Copy off the data
+            iFinal = iParamToFinal[remaining_param_inds[i]]
+
+            new_vec = orig_vec.copy()
+            new_vec[remaining_param_inds[i]] += eps
+            self.model.from_vector(new_vec)
+
+            layout_atom.tree.collapse_circuits_to_process_matrices(self.model, remaining_param_inds[i])
+            Gs, inds_to_update = layout_atom.tree.reconstruct_full_matrices(self.model, remaining_param_inds[i])
+
+            tmp = self._probs_from_rho_e(orig_dense_sp, dense_povms, Gs, return_two_D=True)
+
+            probs2[elm_indices[:, inds_to_update]] = tmp
+
+            array_to_fill[:, iFinal] = (probs2 - base_probs) / eps
+
+            layout_atom.tree.reset_full_matrices_to_base_probs_version()
+
+            if return_sequence_matrices:
+                output.append(Gs)
+
+        self.model.from_vector(orig_vec) # reset to the original model.
+
+        if return_sequence_matrices:
+            return output
\ No newline at end of file
diff --git a/pygsti/layouts/longest_common_evaltree.py b/pygsti/layouts/longest_common_evaltree.py
new file mode 100644
index 000000000..0cb349384
--- /dev/null
+++ b/pygsti/layouts/longest_common_evaltree.py
@@ -0,0 +1,933 @@
+"""
+Defines the EvalTree class.
+"""
+#***************************************************************************************************
+# Copyright 2015, 2019, 2025 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+# Under the terms of Contract DE-NA0003525 with NTESS, the U.S. Government retains certain rights
+# in this software.
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License.  You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0 or in the LICENSE file in the root pyGSTi directory.
+#***************************************************************************************************
+
+from __future__ import annotations
+import time as _time  # DEBUG TIMERS
+
+import numpy as _np
+
+from pygsti.circuits.circuit import Circuit as _Circuit, LayerTupLike
+from pygsti.baseobjs.verbosityprinter import VerbosityPrinter as _VerbosityPrinter
+from pygsti.baseobjs.label import LabelTupTup, Label, LabelTup
+import itertools
+from pygsti.tools.sequencetools import (
+    conduct_one_round_of_lcs_simplification,
+    _compute_lcs_for_every_pair_of_sequences,
+    create_tables_for_internal_LCS,
+    simplify_internal_first_one_round
+)
+from pygsti.tools.dyadickronop import KronStructured
+from pygsti.circuits.split_circuits_into_lanes import (
+    compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit,
+    compute_subcircuits
+)
+
+from scipy.sparse import kron as sparse_kron
+from typing import List, Optional, Iterable, Union, TYPE_CHECKING, Tuple
+from pygsti.tools.tqdm import our_tqdm
+
+LANE = tuple[int, ...]
+
+#region Split circuit list into lists of subcircuits
+
+def _add_in_idle_gates_to_circuit(circuit: _Circuit, idle_gate_name: Union[str, Label] = 'I') -> _Circuit:
+    """
+    Add in explicit idles to the labels for each layer.
+    """
+
+    tmp = circuit.copy(editable=True)
+    num_layers = circuit.num_layers
+
+    for i in range(num_layers):
+        tmp[i] = Label(tmp.layer_label_with_idles(i, idle_gate_name))
+
+    if tmp._static:
+        tmp.done_editing()
+    return tmp
+
+def setup_circuit_list_for_LCS_computations(
+        circuit_list: list[_Circuit],
+        implicit_idle_gate_name: Union[str, Label] = 'I'
+    ) -> tuple[
+        dict[int, dict[int, _Circuit]],
+        dict[LayerTupLike, list[tuple[int, int]]],
+        dict[LANE,list[LayerTupLike]]
+    ]:
+    """
+    Split a circuit list into a list of subcircuits by lanes. These lanes are non-interacting partions of a circuit.
+
+    Also return a sequence detailing the number of lanes in each circuit.
+    Then, a sequence detailing the number of qubits in each lane for a circuit.
+    """
+
+    # We want to split the circuit list into a dictionary of subcircuits where each sub_cir in the dict[key] act exclusively on the same qubits.
+    # I need a mapping from subcircuit to actual circuit. This is uniquely defined by circuit_id and then lane id.
+
+    cir_ind_and_lane_id_to_sub_cir: dict[int, dict[int, _Circuit]] = {}
+    sub_cir_to_cir_id_and_lane_id:  dict[LayerTupLike, list[tuple[int, int]]] = {}
+    line_labels_to_layertup_lists:  dict[LANE, list[LayerTupLike]] = {}
+
+    for i, cir in enumerate(circuit_list):
+
+        if implicit_idle_gate_name:
+            cir = _add_in_idle_gates_to_circuit(cir, implicit_idle_gate_name)
+
+        qubit_to_lane, lane_to_qubits = compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit(cir)
+        sub_cirs = compute_subcircuits(cir, qubit_to_lane, lane_to_qubits)
+
+        if not implicit_idle_gate_name:
+            if not all([len(sc) == len(sub_cirs[0]) for sc in sub_cirs]):
+                raise ValueError("Each lane does not have the same number of layers. Therefore, a lane has an implicit idle gate. Please add in idle gates explicitly to the circuit.")
+
+        assert len(sub_cirs) == len(lane_to_qubits)
+        for j in range(len(sub_cirs)):
+            sc = _Circuit(sub_cirs[j],line_labels=tuple(lane_to_qubits[j]),)
+            lbls = sc._line_labels
+            if lbls in line_labels_to_layertup_lists:
+                line_labels_to_layertup_lists[lbls].append(sc.layertup)
+            else:
+                line_labels_to_layertup_lists[lbls] = [sc.layertup]
+            if sc.layertup in sub_cir_to_cir_id_and_lane_id:
+                sub_cir_to_cir_id_and_lane_id[sc.layertup].append((i,j))
+            else:
+                sub_cir_to_cir_id_and_lane_id[sc.layertup] = [(i,j)]
+            if i in cir_ind_and_lane_id_to_sub_cir:
+                cir_ind_and_lane_id_to_sub_cir[i][j] = sc
+            else:
+                cir_ind_and_lane_id_to_sub_cir[i] = {j: sc}
+
+    return cir_ind_and_lane_id_to_sub_cir, sub_cir_to_cir_id_and_lane_id, line_labels_to_layertup_lists
+
+#endregion Split Circuits by lanes helpers
+
+def cost_to_compute_tensor_matvec_without_reordering(qubit_list: list[int], total_num_qubits: int) -> int:
+
+    assert _np.sum(qubit_list) == total_num_qubits
+
+    if len(qubit_list) == 1:
+        # Basic matvec.
+        cost = 2 * (4**qubit_list[0]**2)
+        return cost
+    
+    elif len(qubit_list) == 2:
+        # vec((A \tensor B) u) = vec(B U A.T)
+        term1 = 2*(4**qubit_list[1]**2) * (4**qubit_list[0]) # MM of BU.
+        term2 = 2 * (4**qubit_list[0]**2) * (4**qubit_list[1]) # MM of U A.T
+        return term1 + term2
+    
+    else:
+        # Just pop off the last term
+        # (B_1 \tensor B_2 ... \tensor B_n) u = (B_n \tensor B_n-1 ... \tensor B_2) U (B_1).T
+
+        right = cost_to_compute_tensor_matvec_without_reordering(qubit_list[:1], qubit_list[0])
+        right *= 4**(_np.sum(qubit_list[1:]))
+        left = cost_to_compute_tensor_matvec_without_reordering(qubit_list[1:],
+                                                                total_num_qubits - qubit_list[0])
+        left *= 4**(qubit_list[0])
+        return left + right
+
+
+#region Lane Collapsing Helpers
+
+def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: LabelTup, saved: dict[int | LabelTup | LabelTupTup, _np.ndarray], swap_dense: _np.ndarray) -> _np.ndarray:
+    """
+    Assumes that a gate which operates on 2 qubits does not have the right orientation if label is (qu_{i+1}, qu_i).
+    """
+    if op.num_qubits == 2:
+        # We may need to do swaps.
+        op_term : _np.ndarray = _np.array([1.])
+        if op in saved:
+            op_term = saved[op]
+        elif op.qubits[1] < op.qubits[0]:  # type: ignore
+            # This is in the wrong order.
+            op_term = model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op)
+            op_term = swap_dense @ (op_term) @ swap_dense.T
+            saved[op] = op_term # Save so we only need to this operation once.
+        else:
+            op_term = model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op)
+        return op_term
+    return model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op)
+
+def get_dense_op_of_gate_with_perfect_swap_gates(model, op: LabelTup, saved: dict[int | LabelTup | LabelTupTup, _np.ndarray], swap_dense: _np.ndarray):
+    """
+    Assumes that a gate which operates on 2 qubits does not have the right orientation if label is (qu_{i+1}, qu_i).
+    """
+    return model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op)
+
+#endregion Lane Collapsing Helpers
+
+
+class EvalTreeBasedUponLongestCommonSubstring():
+    """
+    This class will convert a circuit list into an evaluation cache specifying the order in which to compute
+    the matrix matrix products. We assume that each circuit operates on the same qubits.
+
+    To build the tree we run D rounds of simplification, where D is the length of the longest common subsequence
+    between any pair of circuits or within a circuit.
+
+    In each round of the cache construction, we replace all of the longest common subsequences with a cache number.
+    Before continuing to the next round of the caching algorithm we add the newly chosen subsequences to the list of
+    circuits to cache. Note that cache construction each round can take O(C^2 * L^3) where C is the number of
+    circuits in that round and L is the length of the longest circuit in that round.
+
+    Alternate Constructors:
+        - from_other_eval_tree - Construct new tree from another where there is a mapping from the qubits
+                                active in one to the qubits active in the other.
+
+    Properties:
+        - circuit_to_save_location: dict[tuple[LabelTupTup, ...], int] - maps an initial circuit to its starting location in the cache.
+        - qubit_start_point: int - If this tree is for a LANE of a bigger circuit, where does this lane start.
+        - sequence_intro: dict[int, list[int]] - 
+                a list of indices introduced each round. Note that an cache index, i, which is introduced
+                in the same round as a different cache index, j, cannot depend on j. Each round the indices introduced correspond to
+                longest common sequences that round. Different i, and j, implies different sequences which by construction must be the same length.
+                Therefore, if you have computed all of the indices introduced in the previous rounds you can compute all of the indices in
+                the current round, concurrently.
+
+                This invariant is broken for round 0 if you initially pass 2 or more identical circuits.
+                We do not check if the circuits are identical.
+
+        - cache: dict[int, tuple[Union[LabelTupTup, int], ...]] - the actual tree which will be evaluated to compute the process matrices for each of the circuits.
+                The original circuits are stored in their original order in the keys 0, 1, ..., num_circuits - 1
+        - num_circuits: int - How many circuits are in the unaltered list.
+        
+        - _swap_gate: The two qubit representation of a swap gate which for example would swap the effective region from qubit 0 to qubit 1 in S(Gxpi2:0)S.T .
+
+        - _results: dict[Union[int, LabelTup], _np.ndarray] - dictionary of the process matrices after evaluation with a specific model configuration.
+
+        - cache_ind_to_alphabet_vals_referenced: dict[int, set(LabelTup)] - map from cache index to the gates used in that circuit.
+                TODO: Extract this method to circuit.py
+
+        - alphabet_val_to_sorted_cache_inds: dict[LabelTup, sorted[LANE]] -
+                Which indices do you need to recompute if the model interpretation of the label L changed, but everything else stayed the same.
+
+        - gpindex_to_cache_vals: dict[int, tuple[sorted[list[int]], list[Label]]] - once the cache has been evaluated with a model and you are changing only
+                one parameter at a time, which cache indices do you need to recompute and which labels do you need to reinterpret within the model.
+
+    Methods:
+
+        - collapse_circuits_to_process_matrices - PUBLIC. Evaluate the tree based upon the input model.
+                This will only recompute the portions necessary if the optional index term passed is not None.
+                This method will invalidate the results cache if called with the optional index term passed as None.
+        
+        - _collapse_cache_line - PRIVATE. A line in the cache is a subsequence that needs to be calculated. It may contain more than 2 terms,
+                if the specific subsequence is only referenced once. This method handles reducing the line to a single process matrix and storing
+                the result in the appropriate results cache.
+
+        - _which_full_circuits_will_change_due_to_gpindex_changing - PRIVATE. This populates gpindex_to_cache_vals for a given model.
+
+        - visualize_circuit - PUBLIC. Trace through the tree to see what the circuit is that starts at that location.
+
+        - _handle_results_cache_lookup_and_product - Private. Look up the index in the results cache and multiply with current term.
+
+        - _compute_which_labels_index_depends_on - Private. Used to fill the cache_ind_to_alphabet_vals_referenced dictionary.
+        - flop_cost_of_evaluating_tree - PUBLIC. Given a specific matrix size representation and a specific model, compute how many
+                flops it will take to evaluate the cache assuming that matrix multiplication is the only cost.
+                If you are doing a derivative calculation this will also compute the cost to evaluate the tree in that condition.
+
+
+    Options available during construction
+    """
+
+
+    def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0):
+        """
+        Construct an evaluation order tree for a circuit list that minimizes the number of rounds of computation.
+        """
+
+        self.circuit_to_save_location = {tuple(cir): i for i,cir in enumerate(circuit_list)}
+
+        self.qubit_start_point = qubit_starting_loc
+
+
+        internal_matches = create_tables_for_internal_LCS(circuit_list)
+        best_internal_match = _np.max(internal_matches[0])
+
+        max_rounds = best_internal_match
+
+        self.num_circuits = len(circuit_list)
+
+        sequence_intro = {0: _np.arange(self.num_circuits)}
+
+        cache_pos = self.num_circuits
+        cache = {i: circuit_list[i] for i in range(len(circuit_list))}
+
+        new_circuit_list = [cir for cir in circuit_list] # Get a deep copy since we will modify it here.
+
+        # Let's try simplifying internally first.
+        seq_ind_to_cache_index = {i: i for i in range(self.num_circuits)}
+        
+        best_external_match = _np.max(external_matches[0])
+
+        max_rounds = int(max(best_external_match,best_internal_match))
+        i = 0
+        cache_pos = len(new_circuit_list)
+        while max_rounds > 1:
+            tmp = conduct_one_round_of_lcs_simplification(new_circuit_list,
+                                                          external_matches,
+                                                          internal_matches,
+                                                          cache_pos,
+                                                          cache,
+                                                          seq_ind_to_cache_index)
+            new_circuit_list, cache_pos, cache, sequence_intro[i+1], ext_table, external_sequences, dirty_inds = tmp
+            i += 1
+            dirty_inds = set(_np.arange(len(new_circuit_list))) # TODO: fix to only correct those which are actually dirty.
+            external_matches = _compute_lcs_for_every_pair_of_sequences(new_circuit_list,
+                                                                        ext_table,
+                                                                        external_sequences,
+                                                                        dirty_inds,
+                                                                        max_rounds)
+
+            if best_internal_match < best_external_match and best_external_match < 2 * best_internal_match:
+                # We are not going to get a better internal match.
+                pass
+            elif not self.internal_first:
+                internal_matches = create_tables_for_internal_LCS(new_circuit_list)
+
+            best_external_match = _np.max(external_matches[0])
+            best_internal_match = _np.max(internal_matches[0])
+
+            max_rounds = int(max(best_external_match,best_internal_match))
+
+        self.cache = cache
+        self.from_other = False
+
+        self.sequence_intro = sequence_intro
+
+        from pygsti.modelmembers.operations import StaticStandardOp
+        self._swap_gate = StaticStandardOp('Gswap', basis='pp').to_dense()
+
+        self.cache_ind_to_alphabet_vals_referenced: dict[int, set[LabelTupTup]] = {}
+
+
+        # Useful for repeated calculations seen in a derivative calculation.
+        for key in self.cache:
+            self._compute_which_labels_index_depends_on(key, self.cache_ind_to_alphabet_vals_referenced)
+
+        alphabet_val_to_cache_inds_to_update: dict[LabelTup, set[int]] = {}
+
+        for cache_ind, vals in self.cache_ind_to_alphabet_vals_referenced.items():
+            for val in vals:
+                if isinstance(val, LabelTupTup):
+                    for ind_gate in val:
+                        if ind_gate in alphabet_val_to_cache_inds_to_update:
+                            alphabet_val_to_cache_inds_to_update[ind_gate].add(cache_ind)
+                        else:
+                            alphabet_val_to_cache_inds_to_update[ind_gate] = set([cache_ind])
+                else:
+                    if val in alphabet_val_to_cache_inds_to_update:
+                        alphabet_val_to_cache_inds_to_update[val].add(cache_ind)
+                    else:
+                        alphabet_val_to_cache_inds_to_update[val] = set([cache_ind])
+
+        self._results: dict[Union[int, LabelTupTup], _np.ndarray] = {}
+
+        self.alphabet_val_to_sorted_cache_inds: dict[LabelTup, list[int]] = {}
+
+        self.gpindex_to_cache_vals: dict[int, tuple[list[int], list[Label]]] = {}
+        # This will be filled later by _gpindex_to_cache_inds_needed_to_recompute when we have access to the model.
+        # Warning that changing the model paramvec will result in this cache becoming invalidated.
+        # The user is currently in charge of resetting this cache.
+
+        for val, cache_inds in alphabet_val_to_cache_inds_to_update.items():
+            rnd_nums = {}
+            for cache_ind in cache_inds:
+                for rnd_num in self.sequence_intro:
+                    if cache_ind in self.sequence_intro[rnd_num]:
+                        rnd_nums[cache_ind] = rnd_num
+                        break
+
+            sorted_inds = sorted(cache_inds, key =lambda x : rnd_nums[x])[::-1] # We want to iterate large to small.
+
+            self.alphabet_val_to_sorted_cache_inds[val] = sorted_inds
+
+      
+    def _gpindex_to_cache_inds_needed_to_recompute(self, model, gp_index_changing: int) -> list[int]:
+        """
+        Given that I change the representation of a gate by modifying this index, gp_index_changing,
+        what cache indices do I need to recompute and in what order.
+        """
+        if gp_index_changing in self.gpindex_to_cache_vals:
+            return self.gpindex_to_cache_vals[gp_index_changing]
+
+        cache_inds: list[int] = []
+        all_op_inds: list[int] = []
+        invalid_lbls: list[Label] = []
+        for lbl in self.alphabet_val_to_sorted_cache_inds.keys():
+            # my_op = get_dense_op_of_gate_with_perfect_swap_gates(model, lbl, None, None)
+            try:
+                my_op = model.circuit_layer_operator(lbl, "op") # Assumes that layers have the same gpindices as the gates themselves.
+            except KeyError:
+                # Skip to the next lbl to check. Do not immediately return None!
+                continue
+            op_inds = my_op.gpindices_as_array()
+            if gp_index_changing in op_inds:
+                cache_inds.extend(self.alphabet_val_to_sorted_cache_inds[lbl])
+                invalid_lbls.append(lbl)  # We also invalidate the lbl.
+                all_op_inds.extend(op_inds)
+
+        for ind in all_op_inds:
+            self.gpindex_to_cache_vals[ind] = (cache_inds, invalid_lbls)
+        return cache_inds, invalid_lbls
+    
+    def _which_full_circuits_will_change_due_to_gpindex_changing(self, model, gp_index_changing: int) -> list[int]:
+
+        cache_inds, _ = self._gpindex_to_cache_inds_needed_to_recompute(model, gp_index_changing)
+
+        if len(cache_inds) == 0:
+            return []
+        
+        answer = [ind for ind in range(self.num_circuits) if ind in cache_inds]
+        return answer
+
+
+    def from_other_eval_tree(self, other: EvalTreeBasedUponLongestCommonSubstring, qubit_label_exchange: dict[int, int]):
+        """
+        Construct a tree from another tree.
+        """
+        
+        self.cache = other.cache
+        self.num_circuits = other.num_circuits
+        self.sequence_intro = other.sequence_intro
+        self._swap_gate = other._swap_gate
+        self.circuit_to_save_location = other.circuit_to_save_location
+        self.from_other = other
+
+        for ind in self.cache:
+            for i, term in enumerate(self.cache[ind]):
+                if isinstance(term, int):
+                    pass # The tree will stay the same.
+                elif isinstance(term, LabelTupTup):
+                    new_term = ()
+                    for op in term:
+                        new_qu = (qubit_label_exchange[qu] for qu in op.qubits)
+                        new_op = (op.name, *new_qu)
+                        new_term = (*new_term, new_op)
+                    self.cache[ind][i] = Label(new_term)
+
+        updated = {}
+        for cir, loc in self.circuit_to_save_location.items():
+            new_cir = ()
+            for layer in cir:
+                new_layer = ()
+                for op in layer:
+                    new_op = (op[0], *(qubit_label_exchange[qu] for qu in op[1:]))
+                    new_layer = (*new_layer, new_op)
+                new_cir = (*new_cir, new_layer)
+            updated[new_cir] = loc
+        self.circuit_to_save_location = updated
+
+    def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: int, gp_index_changing: Optional[int] = None):
+        """
+        Compute the total product cache. Note that this may still have a tensor product
+        structure that the operator needs to combine again if they want to have the full 'dense' matrix.
+
+        If gp_index_changing is not None then we have already computed the results once and we only need to update
+        those terms which depend on the specific gp_index.
+        """
+
+        if gp_index_changing is not None:
+
+            # Dig through the tree to see if we have a matching
+
+            cache_inds, invalid_lbls = self._gpindex_to_cache_inds_needed_to_recompute(model, gp_index_changing)
+
+            if cache_inds:
+                # Invalidate all gate labels that we saved just in case.
+                # Invalidate every index in the which we know to be influenced by my_op.
+                # local_changes = {k: v for k, v in self._results.items() \
+                #                     if ((k not in cache_inds) and (not isinstance(k, Label)))} # Could just invalidate only the lbl with the index.
+
+                # Iterating over all the cache will take too long.
+                # So we need to handle the invalidness of certain cache inds when we encounter them.
+                local_changes = {}
+
+                # Ignore the last index which is the Label that matched the gpindex.
+                # We assume that only one will match.
+                for cache_ind in cache_inds:
+                    cumulative_term = None
+                    for term in self.cache[cache_ind]:
+                        cumulative_term = self._collapse_cache_line(model, cumulative_term, term, local_changes,
+                                                                    num_qubits_in_default, cache_inds, invalid_lbls)
+
+                    # Save locally.
+                    if cumulative_term is None:
+                        local_changes[cache_ind] = _np.eye(4**num_qubits_in_default)
+                        # NOTE: unclear when (if ever) this should be a noisy idle gate.
+                    else:
+                        local_changes[cache_ind] = cumulative_term
+
+                return local_changes, self.circuit_to_save_location
+
+            return self._results, self.circuit_to_save_location
+
+        else:
+            self._results = {} # We are asking to reset all the calculations.
+            round_keys = sorted(_np.unique(list(self.sequence_intro.keys())))[::-1]
+            # saved: dict[Union[int, LabelTupTup], _np.ndarray] = {}
+
+            if self.internal_first:
+
+                round_keys = _np.unique(list(self.sequence_intro.keys()))
+
+                pos_inds = _np.where(round_keys >0)
+                pos_keys = round_keys[pos_inds]
+                pos_keys = sorted(pos_keys)[::-1]
+
+                neg_inds = _np.where(round_keys < 0)
+                neg_keys = round_keys[neg_inds]
+                neg_keys = sorted(neg_keys)
+
+                round_keys = pos_keys + neg_keys + _np.array([0])
+            
+            empty = []
+            for key in round_keys:
+                for cache_ind in self.sequence_intro[key]:
+                    cumulative_term = None
+                    for term in self.cache[cache_ind]:
+                        cumulative_term = self._collapse_cache_line(model, cumulative_term, term, self._results,
+                                                                    num_qubits_in_default, empty, empty)
+                            
+                    if cumulative_term is None:
+                        self._results[cache_ind] = _np.eye(4**num_qubits_in_default)
+                        # NOTE: unclear when (if ever) this should be a noisy idle gate.
+                    else:
+                        self._results[cache_ind] = cumulative_term
+        if __debug__:
+            # We may store more in the cache in order to handle multi-qubit gates which are out of the normal order.
+            for key in self.cache:
+                assert key in self._results
+        return self._results, self.circuit_to_save_location
+    
+    def _compute_which_labels_index_depends_on(self, val: Union[int, LabelTupTup], visited: dict[int, set[LabelTupTup]]) -> set[LabelTupTup]:
+        """
+        Determine which labels the specific index in the cache depends on.
+        """
+
+        if not isinstance(val, int):
+            return set([val])
+        elif val in visited:
+            return visited[val]
+        else:
+            tmp = set()
+            for child in self.cache[val]:
+                ret_val = self._compute_which_labels_index_depends_on(child, visited)
+                tmp = tmp.union(ret_val)
+            visited[val] = tmp
+            return tmp
+
+
+    def visualize_circuit(self, val: Union[int, LabelTupTup], visited) -> list[LabelTupTup]:
+        """
+        Recombine circuit by traversing the cache.
+        """
+        if not isinstance(val, int):
+            return [val]
+        elif val in visited:
+            return visited[val]
+        else:
+            tmp = []
+            for child in self.cache[val]:
+                tmp.append(self.visualize_circuit(child, visited))
+            visited[val] = tmp
+            return tmp
+
+    def _handle_results_cache_lookup_and_product(self,
+                            cumulative_term: Optional[_np.ndarray],
+                            term_to_extend_with: Union[int, LabelTupTup],
+                            results_cache: dict[Union[int, LabelTupTup], _np.ndarray]) -> _np.ndarray:
+        """
+        When combining terms in cache for evaluation handle the lookup from the results cache.
+        """
+
+        if cumulative_term is None:
+            return results_cache[term_to_extend_with]
+        return results_cache[term_to_extend_with] @ cumulative_term
+
+
+    def _collapse_cache_line(self, model, cumulative_term: Optional[_np.ndarray],
+                            term_to_extend_with: Union[int, LabelTupTup],
+                            local_results_cache: dict[Union[int, LabelTupTup], _np.ndarray],
+                            num_qubits_in_default: int,
+                            globally_invalid_cache_inds: Optional[list[int]] = None,
+                            globally_invalid_labels: Optional[list[LabelTupTup]] = None
+                            ) -> _np.ndarray:
+        """
+        Reduce a cache line to a single process matrix.
+
+        This should really only be called from collapse_circuits_to_process_matrices.
+        The method will handle looking in the appropriate cache in the case of a derivative
+        approximation.
+        """
+
+        if (term_to_extend_with in local_results_cache):
+            return self._handle_results_cache_lookup_and_product(cumulative_term,
+                                                                term_to_extend_with,
+                                                                local_results_cache)
+        elif isinstance(term_to_extend_with, int) and \
+            (globally_invalid_cache_inds is not None) and \
+            (term_to_extend_with not in globally_invalid_cache_inds) and \
+                (term_to_extend_with in self._results):
+            
+            return self._handle_results_cache_lookup_and_product(cumulative_term,
+                                                                term_to_extend_with,
+                                                                self._results)
+        elif isinstance(term_to_extend_with, LabelTupTup) and \
+            (globally_invalid_labels is not None) and \
+            not (any([t in globally_invalid_labels for t in term_to_extend_with])) \
+                and (term_to_extend_with in self._results):        
+            return self._handle_results_cache_lookup_and_product(cumulative_term,
+                                                                term_to_extend_with,
+                                                                self._results)
+
+        else:
+            val = 1
+            qubits_available = [i + self.qubit_start_point for i in range(num_qubits_in_default)]
+            if isinstance(term_to_extend_with, int):
+                breakpoint()
+            matrix_reps = {op.qubits: get_dense_representation_of_gate_with_perfect_swap_gates(model, op,
+                                            local_results_cache, self._swap_gate) for op in term_to_extend_with}
+            qubit_used = []
+            for key in matrix_reps.keys():
+                qubit_used.extend(key)
+
+            assert len(qubit_used) == len(set(qubit_used))
+            unused_qubits = set(qubits_available) - set(qubit_used)
+
+            implicit_idle_reps = {(qu,): get_dense_representation_of_gate_with_perfect_swap_gates(model,
+                                        Label("Fake_Gate_To_Get_Tensor_Size_Right", qu), # A fake gate to look up and use the appropriate idle gate.
+                                        local_results_cache, self._swap_gate) for qu in unused_qubits}
+
+            while qubits_available:
+
+                qu = qubits_available[0]
+                if qu in unused_qubits:
+                    val = _np.kron(val, implicit_idle_reps[(qu,)])
+                    qubits_available = qubits_available[1:]
+                else:
+                    # It must be a part of a non-trivial gate.
+                    gatekey = [key for key in matrix_reps if qu in key][0]
+                    val = _np.kron(val, matrix_reps[gatekey])
+
+                    qubits_available = qubits_available[len(gatekey):]
+
+            local_results_cache[term_to_extend_with] = val
+            if cumulative_term is None:
+                return val
+            # Cache if off.
+            return local_results_cache[term_to_extend_with] @ cumulative_term
+
+    def flop_cost_of_evaluating_tree(self, matrix_size: tuple[int, int], model = None, gp_index_changing: Optional[int] = None) -> int:
+        """
+        We assume that each matrix matrix multiply is the same size.
+        """
+
+        assert matrix_size[0] == matrix_size[1]
+
+        total_flop_cost = 0
+        if (model is not None) and (gp_index_changing is not None):
+
+            cache_inds, invalid_lbls = self._gpindex_to_cache_inds_needed_to_recompute(model, gp_index_changing)
+
+        else:
+            cache_inds = list(self.cache.keys())
+
+        for cache_ind in cache_inds:
+            num_mm_on_this_cache_line = len(self.cache[cache_ind]) - 1
+            total_flop_cost += (2* (matrix_size[0])**3) * num_mm_on_this_cache_line
+
+        return total_flop_cost
+
+
+class CollectionOfLCSEvalTrees():
+    """
+    A collection of LCS Eval Trees is a group of trees designed to evaluate a list of circuits which have been split into lanes.
+    A lane is a set of quibts which do not interact with any qubits in another lane.
+
+    CollectionOfLCSEvalTrees assumes that you have already split the circuits into such lanes.
+
+    - Properties:
+
+        - trees - dict[lane, EvalTreeLCS] - mapping of lane to LCS evaluation tree.
+        - _assume_matching_tree_for_matching_num_qubits - bool - Are we assuming that if two lanes have the same size,
+                then they will have the same distribution of circuits to test. This can dramatically reduce the cost of computing the orderings.
+        - line_lbls_to_circuit_list - dict[LANE, list[LabelTupTup]] - lanes to circuits to evaluate.
+        - sub_cir_to_full_cir_id_and_lane_id - dict[tuple[LabelTupTup], tuple[int, int]] - map from sub circuit to which lane they are from and the whole circuit.
+        - cir_id_and_lane_id_to_sub_cir - reverse of sub_cir_to_full_cir_id_and_lane_id
+        - process_matrices_which_will_need_to_update_for_index - _np.ndarray - (params, trees, all_circuits)
+                - True False map for if that sub circuit needs to be updated if a param changes.
+    
+                
+    - Methods:
+        - collapse_circuits_to_process_matrices: PUBLIC - main method for evaluating the trees.
+        - reconstruct_full_matrices: PUBLIC - combine the lanes for each circuit into something which can be multiplied against full system SPAM layers.
+        - flop_estimate: PUBLIC - compute the number of flops needed to evaluate the tree fully or partially as in the case of a derivative in one direction.
+        - determine_which_circuits_will_update_for_what_gpindices: PUBLIC - learn the circuits which will get modified for a specific model and index change.
+        - compute_tensor_orders: Private - what would be the best tensor orders for all the circuits?
+        - best_order_for_tensor_contraction: Private - for a single circuit?
+        - _tensor_cost_model: Private - The cost to compute A \otimes B as full dense matrix.
+    """
+
+
+    def __init__(self, line_lbls_to_circuit_list: dict[LANE, list[LabelTupTup]],
+                 sub_cir_to_full_cir_id_and_lane_id,
+                 cir_id_and_lane_id_to_sub_cir):
+        
+        self.trees: dict[LANE, EvalTreeBasedUponLongestCommonSubstring] = {}
+
+        self._assume_matching_tree_for_matching_num_qubits = False
+
+        size_to_tree: dict[int, LANE] = {}
+
+        self.line_lbls_to_cir_list = line_lbls_to_circuit_list
+
+        starttime = _time.time()
+        for key, vals in our_tqdm(line_lbls_to_circuit_list.items(), " Building Longest Common Substring Caches"):
+            sub_cirs = []
+            for cir in vals:
+                sub_cirs.append(list(cir))
+            if self._assume_matching_tree_for_matching_num_qubits:
+                if len(key) not in size_to_tree:
+                    self.trees[key] = EvalTreeBasedUponLongestCommonSubstring(sub_cirs)
+                    size_to_tree[len(key)] = key
+                else:
+                    sample = EvalTreeBasedUponLongestCommonSubstring(sub_cirs[:2]) # Build a small version to be corrected later.
+                    other_key = size_to_tree[len(key)]
+                    sample.from_other_eval_tree(self.trees[other_key], {other_key[i]: key[i] for i in range(len(key))})
+                    self.trees[key] = sample
+            else:
+                self.trees[key] = EvalTreeBasedUponLongestCommonSubstring(sub_cirs, sorted(key)[0])
+
+        endtime = _time.time()
+
+        print(" Time to compute all the evaluation orders (s): ", endtime - starttime)
+
+
+        self.sub_cir_to_full_cir_id_and_lane_id = sub_cir_to_full_cir_id_and_lane_id
+        self.cir_id_and_lane_id_to_sub_cir = cir_id_and_lane_id_to_sub_cir
+
+        self.cir_id_to_tensor_order: dict[int, list[list[int], int]] = {}
+        self.compute_tensor_orders()
+
+        self.saved_results: dict[Union[LabelTupTup, int], _np.ndarray] = {}
+        self.sub_cir_to_ind_in_results: dict[LANE, dict[_Circuit, int]] = {}
+        self.process_matrices_which_will_need_to_update_for_index: _np.ndarray = []
+
+
+    def collapse_circuits_to_process_matrices(self, model, gp_index_changing: Optional[int] = None):
+        """
+        Collapse all circuits to their process matrices. If alphabet_piece_changing is not None, then
+        we assume we have already collapsed this system once before and so only need to update part of the eval tree.
+        """
+        # Just collapse all of them.
+
+        if gp_index_changing is not None:
+            # We may not need to check all of the lanes.
+            pass
+
+        else:
+            self.saved_results = {}
+        
+        for key in self.trees:
+            num_qubits = len(key)
+            tree = self.trees[key]
+            out1, out2 = tree.collapse_circuits_to_process_matrices(model, num_qubits, gp_index_changing)
+            # self.saved_results[key], self.sub_cir_to_ind_in_results[key] = self.trees[key].collapse_circuits_to_process_matrices(model, len(key))
+            self.saved_results[key] = out1
+            self.sub_cir_to_ind_in_results[key] = out2
+
+    def determine_which_circuits_will_update_for_what_gpindices(self, model):
+
+        dirty_circuits = _np.zeros((model.num_params, len(self.trees), len(self.cir_id_and_lane_id_to_sub_cir)))
+        for ind in range(model.num_params):
+
+            for ikey, key in enumerate(self.trees):
+                dirty_circuits[ind, ikey, self.trees[key]._which_full_circuits_will_change_due_to_gpindex_changing(model, ind)] = 1
+
+        self.process_matrices_which_will_need_to_update_for_index = dirty_circuits
+        return dirty_circuits
+
+    def reconstruct_full_matrices(self,
+                                  model = None,
+                                  gp_index_changing: Optional[int] = None) -> \
+                                    Optional[Tuple[List[Union[KronStructured, _np.ndarray]], List[int]]]:
+        """
+        Construct a tensor product structure for each individual circuit
+        """
+
+        if len(self.saved_results) == 0:
+            return
+
+        num_cirs = len(self.cir_id_and_lane_id_to_sub_cir)
+        cir_inds = _np.arange(num_cirs, dtype=_np.int32)
+        if (gp_index_changing is not None) and (model is not None):
+            
+            cir_inds = _np.where(_np.sum(self.process_matrices_which_will_need_to_update_for_index[gp_index_changing], axis=0) >= 1)[0] # At least one lane changed.
+
+            lane_key_to_ind: dict[LANE, int] = {key: ikey for ikey, key in enumerate(self.trees)}
+
+            output = []
+
+            for icir in cir_inds:
+                lane_circuits = []
+                for i in range(len(self.cir_id_and_lane_id_to_sub_cir[icir])):
+                    cir = self.cir_id_and_lane_id_to_sub_cir[icir][i]
+                    lblkey = cir._line_labels
+
+                    ind_in_results = self.sub_cir_to_ind_in_results[lblkey][cir.layertup]
+                    if ind_in_results not in self.saved_results[lblkey]:
+                        # We have only the local changes.
+                        # This will be stored in the results file of the subtree.
+                        lane_circuits.append(self.trees[lblkey].results[ind_in_results])
+                    else:
+                        lane_circuits.append(self.saved_results[lblkey][ind_in_results])
+                if len(lane_circuits) > 1:
+                    output.append(KronStructured(lane_circuits))
+                elif len(lane_circuits) == 1:
+                    output.append(lane_circuits[0]) # gate_sequence[i] @ rho needs to work for i in range(num_circs).
+                else:
+                    raise ValueError()
+        
+            return output, cir_inds
+
+        else:
+            output = []
+            # Now we can do the combination.
+
+            for icir in cir_inds:
+                lane_circuits = []
+                for i in range(len(self.cir_id_and_lane_id_to_sub_cir[icir])):
+                    cir = self.cir_id_and_lane_id_to_sub_cir[icir][i]
+                    lblkey = cir._line_labels
+
+                    ind_in_results = self.sub_cir_to_ind_in_results[lblkey][cir.layertup]
+                    if ind_in_results not in self.saved_results[lblkey]:
+                        # We have only the local changes.
+                        # This will be stored in the results file of the subtree.
+                        lane_circuits.append(self.trees[lblkey].results[ind_in_results])
+                    else:
+                        lane_circuits.append(self.saved_results[lblkey][ind_in_results])
+                if len(lane_circuits) > 1:
+                    output.append(KronStructured(lane_circuits))
+                elif len(lane_circuits) == 1:
+                    output.append(lane_circuits[0]) # gate_sequence[i] @ rho needs to work for i in range(num_circs).
+                else:
+                    raise ValueError()
+            
+        return output, cir_inds
+
+    def flop_estimate(self, return_collapse: bool = False, return_tensor_matvec: bool = False, model = None, gp_index_changing: Optional[int] = None):
+
+
+        cost_collapse = 0
+        for key in self.trees:
+            num_qubits = len(key) if key[0] != ('*',) else key[1] # Stored in the data structure.
+            tree = self.trees[key]
+            cost_collapse += tree.flop_cost_of_evaluating_tree(tuple([4**num_qubits, 4**num_qubits]), model, gp_index_changing)
+        
+
+        tensor_cost = 0
+        num_cirs = len(self.cir_id_and_lane_id_to_sub_cir)
+        cir_inds = _np.arange(num_cirs, dtype=_np.int32)
+
+        if (model is not None) and (gp_index_changing is not None):
+
+            dirty_circuits = self.determine_which_circuits_will_update_for_what_gpindices(model)
+            cir_inds = _np.where(_np.sum(self.process_matrices_which_will_need_to_update_for_index[gp_index_changing], axis=0) >= 1)[0] # At least one lane changed.
+
+        
+        for cir_id in cir_inds:
+            qubit_list = ()
+            for lane_id in range(len(self.cir_id_and_lane_id_to_sub_cir[cir_id])):
+                subcir = self.cir_id_and_lane_id_to_sub_cir[cir_id][lane_id]
+                qubit_list = (*qubit_list, len(subcir._line_labels))
+            qubit_list = list(qubit_list)
+            total_num = _np.sum(qubit_list)
+
+            tensor_cost += cost_to_compute_tensor_matvec_without_reordering(qubit_list, total_num)
+
+        if return_collapse:
+            return tensor_cost + cost_collapse, cost_collapse
+        elif return_tensor_matvec:
+            return tensor_cost + cost_collapse, tensor_cost
+        elif gp_index_changing is not None:
+            return tensor_cost + cost_collapse, len(cir_inds)  # Since you are not updating all of the representations we do not need to update the state props either for those.
+
+        return tensor_cost + cost_collapse
+
+    def compute_tensor_orders(self):
+
+        num_cirs = len(self.cir_id_and_lane_id_to_sub_cir)
+
+        cache_struct = {}
+
+        for cir_id in range(num_cirs):
+            qubit_list = ()
+            for lane_id in range(len(self.cir_id_and_lane_id_to_sub_cir[cir_id])):
+                subcir = self.cir_id_and_lane_id_to_sub_cir[cir_id][lane_id]
+                qubit_list = (*qubit_list, len(subcir._line_labels))
+            self.cir_id_to_tensor_order[cir_id] = self.best_order_for_tensor_contraction(qubit_list, cache_struct)
+
+        return
+            
+    def best_order_for_tensor_contraction(self,
+                    qubit_list: LANE,
+                    cache: dict[LANE, tuple[list[int], int]]) -> tuple[list[int], int]:
+        """
+        Find the tensor contraction order that minizes the cost of contracting to a dense system with
+        a total number of qubits equal to the len(qubit_list)
+        """
+        
+
+        if qubit_list in cache:
+            return cache[qubit_list]
+
+        best_cost = _np.inf
+        best_order = []
+
+        for order in itertools.permutations(range(len(qubit_list)-1), len(qubit_list)-1):
+
+            my_list = [qb for qb in qubit_list] # force deep copy.
+            my_starting_points = [sp for sp in order]
+            cost = 0
+            early_exit = False
+            while my_starting_points and not early_exit:
+                sp = my_starting_points.pop(0)
+
+                cost += self._tensor_cost_model(my_list[sp], my_list[sp+1])
+                if cost <= best_cost:
+                    # modify sp for future.
+                    tmp = []
+                    for new_val in my_starting_points:
+                        tmp.append((new_val - 1)*(new_val > sp) + (new_val) * (new_val < sp))
+                    my_starting_points = tmp
+
+                    q2 = my_list.pop(sp+1)
+                    my_list[sp] += q2
+                else:
+                    early_exit = True # This round is done because the partial sum was too big.
+
+            if cost < best_cost and not early_exit:
+                best_cost = cost
+                best_order = list(order)
+
+        # Store off the information.
+        cache[qubit_list] = best_order, best_cost
+
+        return best_order, best_cost
+
+    def _tensor_cost_model(self, num_qubits1, num_qubits2):
+        """
+        Assumes kronecker product of 2 square matrices.
+        """
+
+        return (4**num_qubits1)**2 * (4**num_qubits2)**2
diff --git a/pygsti/layouts/matrixlayout.py b/pygsti/layouts/matrixlayout.py
index 7642978e5..eac3df439 100644
--- a/pygsti/layouts/matrixlayout.py
+++ b/pygsti/layouts/matrixlayout.py
@@ -10,18 +10,20 @@
 # http://www.apache.org/licenses/LICENSE-2.0 or in the LICENSE file in the root pyGSTi directory.
 #***************************************************************************************************
 
-import collections as _collections
 
 import numpy as _np
 
 from pygsti.layouts.distlayout import DistributableCOPALayout as _DistributableCOPALayout
 from pygsti.layouts.distlayout import _DistributableAtom
-from pygsti.layouts.evaltree import EvalTree as _EvalTree
+from pygsti.layouts.evaltree import (
+    CollectionOfLCSEvalTrees as _CollectionOfLCSEvalTrees,
+    EvalTree as _EvalTree,
+    setup_circuit_list_for_LCS_computations as _setup_circuit_list_for_LCS_computations,
+)
 from pygsti.circuits.circuitlist import CircuitList as _CircuitList
 from pygsti.tools import listtools as _lt
 from pygsti.tools import slicetools as _slct
 
-
 class _MatrixCOPALayoutAtom(_DistributableAtom):
     """
     The atom ("atomic unit") for dividing up the element dimension in a :class:`MatrixCOPALayout`.
@@ -100,7 +102,7 @@ def add_expanded_circuits(indices, add_to_this_dict):
 
                         #Now add these outcomes to `expanded_nospam_circuit_outcomes` - note that multiple "unique_i"'s
                         # may exist for the same expanded & without-spam circuit (exp_nospam_c) and so we need to
-                        # keep track of a list of unique_i indices for each circut and spam tuple below.
+                        # keep track of a list of unique_i indices for each circuit and spam tuple below.
                         if exp_nospam_c not in _expanded_nospam_circuit_outcomes:
                             _expanded_nospam_circuit_outcomes[exp_nospam_c] = {st:(outcome, [unique_i]) for st, outcome in zip(spam_tuples, outcomes)}
                         else:
@@ -138,6 +140,7 @@ def add_expanded_circuits(indices, add_to_this_dict):
             if double_expanded_ckt is None: #Fall back to standard behavior and do expansion.
                 double_expanded_ckt = cir.expand_subcircuits()
             double_expanded_nospam_circuits_plus_scratch[i] = double_expanded_ckt
+
         self.tree = _EvalTree.create(double_expanded_nospam_circuits_plus_scratch)
         #print("Atom tree: %d circuits => tree of size %d" % (len(expanded_nospam_circuits), len(self.tree)))
 
@@ -151,7 +154,214 @@ def add_expanded_circuits(indices, add_to_this_dict):
         tree_indices_by_spamtuple = dict()  # "tree" indices index expanded_nospam_circuits
         for i, c in expanded_nospam_circuits.items():
             for spam_tuple in expanded_nospam_circuit_outcomes[c].keys():
-                if spam_tuple not in tree_indices_by_spamtuple: tree_indices_by_spamtuple[spam_tuple] = []
+                if spam_tuple not in tree_indices_by_spamtuple:
+                    tree_indices_by_spamtuple[spam_tuple] = []
+                tree_indices_by_spamtuple[spam_tuple].append(i)
+
+        #Assign element indices, starting at `offset`
+        # now that we know how many of each spamtuple there are, assign final element indices.
+        local_offset = 0
+        self.indices_by_spamtuple = dict()  # values are (element_indices, tree_indices) tuples.
+        for spam_tuple, tree_indices in tree_indices_by_spamtuple.items():
+            self.indices_by_spamtuple[spam_tuple] = (slice(local_offset, local_offset + len(tree_indices)),
+                                                     _slct.list_to_slice(tree_indices, array_ok=True))
+            local_offset += len(tree_indices)
+            #TODO: allow tree_indices to be None or a slice?
+
+        element_slice = None  # slice(offset, offset + local_offset)  # *global* (of parent layout) element-index slice
+        num_elements = local_offset
+
+        elindex_outcome_tuples = {unique_i: list() for unique_i in range(len(unique_complete_circuits))}
+
+        for spam_tuple, (element_indices, tree_indices) in self.indices_by_spamtuple.items():
+            for elindex, tree_index in zip(_slct.indices(element_indices), _slct.to_array(tree_indices)):
+                outcome_by_spamtuple = expanded_nospam_circuit_outcomes[expanded_nospam_circuits[tree_index]]
+                outcome, unique_is = outcome_by_spamtuple[spam_tuple]
+                for unique_i in unique_is:
+                    elindex_outcome_tuples[unique_i].append((elindex, outcome))  # *local* element indices
+        self.elindex_outcome_tuples = elindex_outcome_tuples
+
+        super().__init__(element_slice, num_elements)
+
+    def nonscratch_cache_view(self, a, axis=None):
+        """
+        Create a view of array `a` restricting it to only the *final* results computed by this tree.
+
+        This need not be the entire array because there could be intermediate results
+        (e.g. "scratch space") that are excluded.
+
+        Parameters
+        ----------
+        a : ndarray
+            An array of results computed using this EvalTree,
+            such that the `axis`-th dimension equals the full
+            length of the tree.  The other dimensions of `a` are
+            unrestricted.
+
+        axis : int, optional
+            Specified the axis along which the selection of the
+            final elements is performed. If None, than this
+            selection if performed on flattened `a`.
+
+        Returns
+        -------
+        ndarray
+            Of the same shape as `a`, except for along the
+            specified axis, whose dimension has been reduced
+            to filter out the intermediate (non-final) results.
+        """
+        if axis is None:
+            return a[0:self._num_nonscratch_tree_items]
+        else:
+            sl = [slice(None)] * a.ndim
+            sl[axis] = slice(0, self._num_nonscratch_tree_items)
+            ret = a[tuple(sl)]
+            assert(ret.base is a or ret.base is a.base)  # check that what is returned is a view
+            assert(ret.size == 0 or _np.may_share_memory(ret, a))
+            return ret
+
+    @property
+    def cache_size(self):
+        """The cache size of this atom."""
+        return len(self.tree)
+    
+
+class _MatrixCOPALayoutAtomWithLCS(_DistributableAtom):
+    """
+    The atom ("atomic unit") for dividing up the element dimension in a :class:`MatrixCOPALayout`.
+
+    Parameters
+    ----------
+    unique_complete_circuits : list
+        A list that contains *all* the "complete" circuits for the parent layout.  This
+        atom only owns a subset of these, as given by `group` below.
+
+    unique_nospam_circuits : list
+        A list that contains the unique circuits within `unique_complete_circuits` once
+        their state preparations and measurements are removed.  A subset of these circuits
+        (see `group` below) are what fundamentally define the circuit outcomes that this atom
+        includes: it includes *all* the circuit outcomes of those circuits.
+
+    circuits_by_unique_nospam_circuits : dict
+       A dictionary with keys equal to the elements of `unique_nospam_circuits` and values
+       that are lists of indices into `unique_complete_circuits`.  Thus, this dictionary
+       maps each distinct circuit-without-SPAM circuit to the list of complete circuits
+       within `unique_complete_circuits` that correspond to it.
+
+    ds_circuits : list
+        A list of circuits parallel to `unique_complete_circuits` of these circuits
+        as they should be accessed from `dataset`.  This applies any aliases and
+        removes implied SPAM elements relative to `unique_complete_circuits`.
+
+    group : set
+        The set of indices into `unique_nospam_circuits` that define the circuit
+        outcomes owned by this atom.
+
+    helpful_scratch : set
+        A set of indices into `unique_nospam_circuits` that specify circuits that
+        aren't owned by this atom but are helpful in building up an efficient evaluation
+        tree.
+
+    model : Model
+        The model being used to construct this layout.  Used for expanding instruments
+        within the circuits.
+    
+    unique_circuits : list of Circuits
+        A list of the unique :class:`Circuit` objects representing the circuits this layout will include.
+
+    dataset : DataSet
+        The dataset, used to include only observed circuit outcomes in this atom
+        and therefore the parent layout.
+    """
+
+    def __init__(self, unique_complete_circuits, unique_nospam_circuits, circuits_by_unique_nospam_circuits,
+                 ds_circuits, group, helpful_scratch, model, unique_circuits, dataset=None, expanded_and_separated_circuit_cache=None,
+                 double_expanded_nospam_circuits_cache = None, implicit_idle_gate = None):
+
+        if expanded_and_separated_circuit_cache is None:
+            expanded_and_separated_circuit_cache = dict()
+
+        #Note: group gives unique_nospam_circuits indices, which circuits_by_unique_nospam_circuits
+        # turns into "unique complete circuit" indices, which the layout via it's to_unique can map
+        # to original circuit indices.
+        def add_expanded_circuits(indices, add_to_this_dict):
+            _expanded_nospam_circuit_outcomes = add_to_this_dict
+            for i in indices:
+                nospam_c = unique_nospam_circuits[i]
+                for unique_i in circuits_by_unique_nospam_circuits[nospam_c]:  # "unique" circuits: add SPAM to nospam_c
+                    #the cache is indexed into using the (potentially) incomplete circuits
+                    expc_outcomes = expanded_and_separated_circuit_cache.get(unique_circuits[unique_i], None)
+                    if expc_outcomes is None: #fall back on original non-cache behavior.
+                        observed_outcomes = None if (dataset is None) else dataset[ds_circuits[unique_i]].unique_outcomes
+                        expc_outcomes = model.expand_instruments_and_separate_povm(unique_complete_circuits[unique_i], observed_outcomes)
+                        #and add this new value to the cache.
+                        expanded_and_separated_circuit_cache[unique_circuits[unique_i]] = expc_outcomes 
+                    for sep_povm_c, outcomes in expc_outcomes.items():  # for each expanded cir from unique_i-th circuit
+                        prep_lbl = sep_povm_c.circuit_without_povm[0]
+                        exp_nospam_c = sep_povm_c.circuit_without_povm[1:]  # sep_povm_c *always* has prep lbl
+                        spam_tuples = [(prep_lbl, elabel) for elabel in sep_povm_c.full_effect_labels]
+                        outcome_by_spamtuple = {st:outcome for st, outcome in zip(spam_tuples, outcomes)}
+
+                        #Now add these outcomes to `expanded_nospam_circuit_outcomes` - note that multiple "unique_i"'s
+                        # may exist for the same expanded & without-spam circuit (exp_nospam_c) and so we need to
+                        # keep track of a list of unique_i indices for each circuit and spam tuple below.
+                        if exp_nospam_c not in _expanded_nospam_circuit_outcomes:
+                            _expanded_nospam_circuit_outcomes[exp_nospam_c] = {st:(outcome, [unique_i]) for st, outcome in zip(spam_tuples, outcomes)}
+                        else:
+                            for st, outcome in outcome_by_spamtuple.items():
+                                if st in _expanded_nospam_circuit_outcomes[exp_nospam_c]:
+                                    existing_outcome, existing_unique_is = \
+                                        _expanded_nospam_circuit_outcomes[exp_nospam_c][st]
+                                    assert(existing_outcome == outcome), "Outcome should be same when spam tuples are!"
+                                    assert(unique_i not in existing_unique_is)  # SLOW - remove?
+                                    existing_unique_is.append(unique_i)
+                                else:
+                                    _expanded_nospam_circuit_outcomes[exp_nospam_c][st] = (outcome, [unique_i])
+
+        # keys = expanded circuits w/out SPAM layers; values = spamtuple => (outcome, unique_is) dictionary that
+        # keeps track of which "unique" circuit indices having each spamtuple / outcome.
+        expanded_nospam_circuit_outcomes = dict()
+        add_expanded_circuits(group, expanded_nospam_circuit_outcomes)
+        expanded_nospam_circuits = {i:cir for i, cir in enumerate(expanded_nospam_circuit_outcomes.keys())}
+
+        # add suggested scratch to the "final" elements as far as the tree creation is concerned
+        # - this allows these scratch element to help balance the tree.
+        if helpful_scratch:
+            expanded_nospam_circuit_outcomes_plus_scratch = expanded_nospam_circuit_outcomes.copy()
+            add_expanded_circuits(helpful_scratch, expanded_nospam_circuit_outcomes_plus_scratch)
+            expanded_nospam_circuits_plus_scratch = {i:cir for i, cir in enumerate(expanded_nospam_circuit_outcomes_plus_scratch.keys())}
+        else:
+            expanded_nospam_circuits_plus_scratch = expanded_nospam_circuits.copy()
+        
+        if double_expanded_nospam_circuits_cache is None:
+            double_expanded_nospam_circuits_cache = dict()
+        double_expanded_nospam_circuits_plus_scratch = dict()
+        for i, cir in expanded_nospam_circuits_plus_scratch.items():
+            # expand sub-circuits for a more efficient tree
+            double_expanded_ckt = double_expanded_nospam_circuits_cache.get(cir, None)
+            if double_expanded_ckt is None: #Fall back to standard behavior and do expansion.
+                double_expanded_ckt = cir.expand_subcircuits()
+            double_expanded_nospam_circuits_plus_scratch[i] = double_expanded_ckt
+
+        vals = list(double_expanded_nospam_circuits_plus_scratch.values())
+        
+        cir_ind_and_lane_id_to_sub_cir, sub_cir_to_cir_id_and_lane_id, line_labels_to_circuit_list = _setup_circuit_list_for_LCS_computations(vals, implicit_idle_gate)
+        self.tree = _CollectionOfLCSEvalTrees(line_labels_to_circuit_list, sub_cir_to_cir_id_and_lane_id, cir_ind_and_lane_id_to_sub_cir)
+      
+        #print("Atom tree: %d circuits => tree of size %d" % (len(expanded_nospam_circuits), len(self.tree)))
+
+        self._num_nonscratch_tree_items = len(expanded_nospam_circuits)  # put this in EvalTree?
+
+        # self.tree's elements give instructions for evaluating ("caching") no-spam quantities (e.g. products).
+        # Now we assign final element indices to the circuit outcomes corresponding to a given no-spam ("tree")
+        # quantity plus a spam-tuple. We order the final indices so that all the outcomes corresponding to a
+        # given spam-tuple are contiguous.
+
+        tree_indices_by_spamtuple = dict()  # "tree" indices index expanded_nospam_circuits
+        for i, c in expanded_nospam_circuits.items():
+            for spam_tuple in expanded_nospam_circuit_outcomes[c].keys():
+                if spam_tuple not in tree_indices_by_spamtuple:
+                    tree_indices_by_spamtuple[spam_tuple] = []
                 tree_indices_by_spamtuple[spam_tuple].append(i)
 
         #Assign element indices, starting at `offset`
@@ -177,6 +387,14 @@ def add_expanded_circuits(indices, add_to_this_dict):
                     elindex_outcome_tuples[unique_i].append((elindex, outcome))  # *local* element indices
         self.elindex_outcome_tuples = elindex_outcome_tuples
 
+
+        print("Flop cost to evaluate the tree once: ", self.tree.flop_estimate())
+        
+        # num_circs = len(cir_ind_and_lane_id_to_sub_cir)
+        # num_rho_and_em = len(self.indices_by_spamtuple.keys())
+        # num_qubits_in_circuit = unique_circuits[0].num_lines
+        # print("Flop cost for <p_i G Em>: ", (2*(4**num_qubits_in_circuit)**2)*num_circs*num_rho_and_em)
+
         super().__init__(element_slice, num_elements)
 
     def nonscratch_cache_view(self, a, axis=None):
@@ -261,7 +479,7 @@ class MatrixCOPALayout(_DistributableCOPALayout):
         A 1- or 2-tuple of integers specifying how many parameter-block processors are
         used when dividing the physical processors into a grid.  The first and second
         elements correspond to counts for the first and second parameter dimensions,
-        respecively.
+        respectively.
 
     param_dimensions : tuple, optional
         The number of parameters along each parameter dimension.  Can be an
@@ -286,10 +504,19 @@ class MatrixCOPALayout(_DistributableCOPALayout):
         circuits. I.e. circuits with prep labels and POVM labels appended.
     """
 
-    def __init__(self, circuits, model, dataset=None, num_sub_trees=None, num_tree_processors=1,
+    def __init__(self, circuits, model, dataset=None, num_sub_trees=None, num_tree_processors=2,
                  num_param_dimension_processors=(), param_dimensions=(),
                  param_dimension_blk_sizes=(), resource_alloc=None, verbosity=0, 
-                 layout_creation_circuit_cache = None):
+                 layout_creation_circuit_cache = None, use_old_tree_style: bool = True):
+        
+        if not use_old_tree_style:
+            # NOTE: Error out if we are using new tree and have an explicit op model. Explain why this is bad.
+            from pygsti.models import ExplicitOpModel, ImplicitOpModel
+            if isinstance(model, ExplicitOpModel):
+                raise ValueError(f"Model: {model.__class__} does not support creation of embedded op process matrices." +
+                    "One needs to be able to create the smallest representation possible a 4x4 matrix for a gate acting on a single qubit. In the case of a two qubit system (Gxpi2, 1) in the model could return a 16x16 matrix." +
+                    "This indicates that it was actually acting on the full two qubit system. We assume in the lane splitting algorithm, that the label 1 indicates it will only act on a single qubit specifically qubit 1." +
+                    f"To remedy this situation please convert the model to a subclass of {ImplicitOpModel}.")
 
         #OUTDATED: TODO - revise this:
         # 1. pre-process => get complete circuits => spam-tuples list for each no-spam circuit (no expanding yet)
@@ -298,7 +525,7 @@ def __init__(self, circuits, model, dataset=None, num_sub_trees=None, num_tree_p
         #    - heuristically find groups of circuits that meet criteria
         # 3. separately create a tree of no-spam expanded circuits originating from each group => self.atoms
         # 4. assign "cache" and element indices so that a) all elements of a tree are contiguous
-        #    and b) elements with the same spam-tuple are continguous.
+        #    and b) elements with the same spam-tuple are contiguous.
         # 5. initialize base class with given per-original-circuit element indices.
 
         unique_circuits, to_unique = self._compute_unique_circuits(circuits)
@@ -368,13 +595,27 @@ def __init__(self, circuits, model, dataset=None, num_sub_trees=None, num_tree_p
 
         def _create_atom(args):
             group, helpful_scratch_group = args
-            return _MatrixCOPALayoutAtom(unique_complete_circuits, unique_nospam_circuits,
+            if use_old_tree_style:
+                return _MatrixCOPALayoutAtom(unique_complete_circuits, unique_nospam_circuits,
                                          circuits_by_unique_nospam_circuits, ds_circuits,
                                          group, helpful_scratch_group, model, 
                                          unique_circuits, dataset,
                                          self.expanded_and_separated_circuits_cache,
                                          self.expanded_subcircuits_no_spam_cache)
 
+            gatename = None
+            if hasattr(model._layer_rules, "_singleq_idle_layer_labels"):
+                if model._layer_rules._singleq_idle_layer_labels:
+                    keys = list(model._layer_rules._singleq_idle_layer_labels.keys())
+                    if model._layer_rules.implicit_idle_mode == "pad_1Q":
+                        gatename = model._layer_rules._singleq_idle_layer_labels[keys[0]].name
+            return _MatrixCOPALayoutAtomWithLCS(unique_complete_circuits, unique_nospam_circuits,
+                                         circuits_by_unique_nospam_circuits, ds_circuits,
+                                         group, helpful_scratch_group, model, 
+                                         unique_circuits, dataset,
+                                         self.expanded_and_separated_circuits_cache,
+                                         self.expanded_subcircuits_no_spam_cache, implicit_idle_gate=gatename)
+
         super().__init__(circuits, unique_circuits, to_unique, unique_complete_circuits,
                          _create_atom, list(zip(groups, helpful_scratch)), num_tree_processors,
                          num_param_dimension_processors, param_dimensions,
diff --git a/pygsti/layouts/prefixtable.py b/pygsti/layouts/prefixtable.py
index 8049c4b15..9e0135db8 100644
--- a/pygsti/layouts/prefixtable.py
+++ b/pygsti/layouts/prefixtable.py
@@ -15,6 +15,7 @@
 from math import ceil
 from pygsti.baseobjs import Label as _Label
 from pygsti.circuits.circuit import SeparatePOVMCircuit as _SeparatePOVMCircuit
+from pygsti.tools.tqdm import our_tqdm
 
 
 class PrefixTable(object):
@@ -685,11 +686,10 @@ def _cache_hits(circuit_reps, circuit_lengths):
     # Not: this logic could be much better, e.g. computing a cost savings for each
     #  potentially-cached item and choosing the best ones, and proper accounting
     #  for chains of cached items.
-    
     cacheIndices = []  # indices into circuits_to_evaluate of the results to cache
     cache_hits = [0]*len(circuit_reps)
 
-    for i in range(len(circuit_reps)):
+    for i in our_tqdm(range(len(circuit_reps)), 'Prefix table : _cache_hits '):
         circuit = circuit_reps[i] 
         L = circuit_lengths[i]  # can be a Circuit or a label tuple
         for cached_index in reversed(cacheIndices):
@@ -707,10 +707,11 @@ def _build_table(sorted_circuits_to_evaluate, cache_hits, max_cache_size, circui
 
     # Build prefix table: construct list, only caching items with hits > 0 (up to max_cache_size)
     cacheIndices = []  # indices into circuits_to_evaluate of the results to cache
-    table_contents = [None]*len(sorted_circuits_to_evaluate)
+    num_sorted_circuits = len(sorted_circuits_to_evaluate)
+    table_contents = [None]*num_sorted_circuits
     curCacheSize = 0
-    for j, (i, _) in zip(orig_indices,enumerate(sorted_circuits_to_evaluate)):
-        
+    for i in our_tqdm(range(num_sorted_circuits), 'Prefix table : _build_table '):
+        j = orig_indices[i]
         circuit_rep = circuit_reps[i] 
         L = circuit_lengths[i]
 
diff --git a/pygsti/modelmembers/modelmember.py b/pygsti/modelmembers/modelmember.py
index 11a4f25e9..9b9937b56 100644
--- a/pygsti/modelmembers/modelmember.py
+++ b/pygsti/modelmembers/modelmember.py
@@ -335,7 +335,7 @@ def unlink_parent(self, force=False):
         None
         """
         for subm in self.submembers():
-            subm.unlink_parent()
+            subm.unlink_parent(force)
 
         if (self.parent is not None) and (force or self.parent._obj_refcount(self) == 0):
             self._parent = None
diff --git a/pygsti/modelmembers/operations/__init__.py b/pygsti/modelmembers/operations/__init__.py
index d0f340c4f..159bb1d93 100644
--- a/pygsti/modelmembers/operations/__init__.py
+++ b/pygsti/modelmembers/operations/__init__.py
@@ -31,6 +31,7 @@
 from .linearop import finite_difference_deriv_wrt_params, finite_difference_hessian_wrt_params
 from .lpdenseop import LinearlyParamArbitraryOp
 from .opfactory import OpFactory, EmbeddedOpFactory, EmbeddingOpFactory, ComposedOpFactory
+from .permutationop import PermutationOperator
 from .repeatedop import RepeatedOp
 from .staticarbitraryop import StaticArbitraryOp
 from .staticcliffordop import StaticCliffordOp
diff --git a/pygsti/modelmembers/operations/opfactory.py b/pygsti/modelmembers/operations/opfactory.py
index cbaed54c3..0719e2a75 100644
--- a/pygsti/modelmembers/operations/opfactory.py
+++ b/pygsti/modelmembers/operations/opfactory.py
@@ -9,6 +9,7 @@
 # in compliance with the License.  You may obtain a copy of the License at
 # http://www.apache.org/licenses/LICENSE-2.0 or in the LICENSE file in the root pyGSTi directory.
 #***************************************************************************************************
+from __future__ import annotations
 import numpy as _np
 
 from pygsti.modelmembers.operations.staticunitaryop import StaticUnitaryOp as _StaticUnitaryOp
@@ -24,9 +25,10 @@
 from pygsti.baseobjs import basis as _basis
 from pygsti.evotypes import Evotype as _Evotype
 from pygsti.tools import optools as _ot
+from pygsti.modelmembers.operations.linearop import LinearOperator as _LinearOperator
 
 
-def op_from_factories(factory_dict, lbl):
+def op_from_factories(factory_dict: dict[_Lbl, OpFactory], lbl: _Lbl) -> _LinearOperator:
     """
     Create an operator for `lbl` from the factories in `factory_dict`.
 
diff --git a/pygsti/models/explicitmodel.py b/pygsti/models/explicitmodel.py
index 8479e3bb1..f677057f5 100644
--- a/pygsti/models/explicitmodel.py
+++ b/pygsti/models/explicitmodel.py
@@ -35,7 +35,7 @@
 from pygsti.modelmembers.operations import opfactory as _opfactory
 from pygsti.baseobjs.basis import Basis as _Basis
 from pygsti.baseobjs.basis import BuiltinBasis as _BuiltinBasis, DirectSumBasis as _DirectSumBasis
-from pygsti.baseobjs.label import Label as _Label, CircuitLabel as _CircuitLabel
+from pygsti.baseobjs.label import Label as _Label, CircuitLabel as _CircuitLabel, LabelTup as _LabelTup
 from pygsti.baseobjs import statespace as _statespace
 from pygsti.tools import basistools as _bt
 from pygsti.tools import jamiolkowski as _jt
@@ -47,6 +47,33 @@
 from pygsti import SpaceT
 from pygsti.tools.legacytools import deprecate as _deprecated_fn
 
+from pygsti.modelmembers.operations import EmbeddedOp as _EmbeddedOp, ComposedOp as _ComposedOp
+
+
+class Roster:
+    def __init__(self, arg):
+        if isinstance(arg, str) and arg == 'all':
+            self.trivial = True
+            self.collection = None
+        else:
+            self.trivial = False
+            self.collection = arg
+    def __contains__(self, item):
+        return self.trivial or (item in self.collection)
+
+
+class ModelView:
+    @staticmethod
+    def cast(arg):
+        return arg if arg is not None else ModelView()
+    def __init__(self):
+        self.operations  = _collections.defaultdict(lambda: None)
+        self.preps       = _collections.defaultdict(lambda: None)
+        self.povms       = _collections.defaultdict(lambda: None)
+        self.instruments = _collections.defaultdict(lambda: None)
+        self.factories   = _collections.defaultdict(lambda: None)
+
+
 
 class ExplicitOpModel(_mdl.OpModel):
     """
@@ -331,32 +358,56 @@ def __getitem__(self, label):
         else:
             raise KeyError("Key %s has an invalid prefix" % label)
 
-    def convert_members_inplace(self, to_type, categories_to_convert='all', labels_to_convert='all',
-                                ideal_model=None, flatten_structure=False, set_default_gauge_group=False, cptp_truncation_tol= 1e-7, spam_cp_penalty = 1e-7):
+    def convert_members_inplace(self, to_type,
+            categories_to_convert='all', labels_to_convert='all',
+            ideal_model=None, flatten_structure=False, set_default_gauge_group=False,
+            cptp_truncation_tol= 1e-7, spam_cp_penalty = 1e-7, allow_smaller_pp_basis=False
+        ):
         """
         TODO: docstring -- like set_all_parameterizations but doesn't set default gauge group by default
+
+        allow_smaller_pp_basis : bool
+
+            Setting allow_smaller_pp_basis=True allows dimension mismatches between
+            this ExplicitOpModel's operations and the dimensions we'd expect for
+            operations based on the properties of self.basis.
+            
+            We can ONLY handle mismatches when self.basis.name indicates a Pauli product basis
+            or a tensor product thereof. We handle a failed conversion by trying a second time,
+            passing in the string literal `pp` instead of `self.basis` to _op.convert(...).
+    
         """
         if isinstance(categories_to_convert, str): categories_to_convert = (categories_to_convert,)
+        assert all(c in ['all', 'ops', 'operations', 'instruments', 'preps', 'povms'] for c in categories_to_convert)
+        fallback_basis = '' if not allow_smaller_pp_basis else self.basis.name.replace('pp','').replace('*','') + 'pp'
+        ideal_model = ModelView.cast(ideal_model)
+        roster = Roster(labels_to_convert)
         if any([c in categories_to_convert for c in ('all', 'ops', 'operations')]):
-            for lbl, gate in self.operations.items():
-                if labels_to_convert == 'all' or lbl in labels_to_convert:
-                    ideal = ideal_model.operations.get(lbl, None) if (ideal_model is not None) else None
-                    self.operations[lbl] = _op.convert(gate, to_type, self.basis, ideal, flatten_structure, cptp_truncation_tol)
+            op_items = [(k,v) for (k,v) in self.operations.items() if k in roster]
+            for lbl, gate in op_items:
+                ideal = ideal_model.operations[lbl]
+                try:
+                    op = _op.convert(gate, to_type, self.basis, ideal, flatten_structure, cptp_truncation_tol)
+                except ValueError as e:
+                    if not fallback_basis == 'pp':
+                        raise e
+                    op = _op.convert(gate, to_type, 'pp', ideal, flatten_structure, cptp_truncation_tol)
+                self.operations[lbl] = op
         if any([c in categories_to_convert for c in ('all', 'instruments')]):
-            for lbl, inst in self.instruments.items():
-                if labels_to_convert == 'all' or lbl in labels_to_convert:
-                    ideal = ideal_model.instruments.get(lbl, None) if (ideal_model is not None) else None
-                    self.instruments[lbl] = _instrument.convert(inst, to_type, self.basis, ideal, flatten_structure)
+            inst_items = [(k,v) for (k,v) in self.instruments.items() if k in roster]
+            for lbl, inst in inst_items:
+                ideal = ideal_model.instruments[lbl]
+                self.instruments[lbl] = _instrument.convert(inst, to_type, self.basis, ideal, flatten_structure)
         if any([c in categories_to_convert for c in ('all', 'preps')]):
-            for lbl, prep in self.preps.items():
-                if labels_to_convert == 'all' or lbl in labels_to_convert:
-                    ideal = ideal_model.preps.get(lbl, None) if (ideal_model is not None) else None
-                    self.preps[lbl] = _state.convert(prep, to_type, self.basis, ideal, flatten_structure, cp_penalty=spam_cp_penalty)
+            prep_items = [(k,v) for (k,v) in self.preps.items() if k in roster]
+            for lbl, prep in prep_items:
+                ideal = ideal_model.preps[lbl]
+                self.preps[lbl] = _state.convert(prep, to_type, self.basis, ideal, flatten_structure, cp_penalty=spam_cp_penalty)
         if any([c in categories_to_convert for c in ('all', 'povms')]):
-            for lbl, povm in self.povms.items():
-                if labels_to_convert == 'all' or lbl in labels_to_convert:
-                    ideal = ideal_model.povms.get(lbl, None) if (ideal_model is not None) else None
-                    self.povms[lbl] = _povm.convert (povm, to_type, self.basis, ideal, flatten_structure, cp_penalty=spam_cp_penalty)
+            povm_items = [(k,v) for (k,v) in self.povms.items() if k in roster]
+            for lbl, povm in povm_items:
+                ideal = ideal_model.povms[lbl]
+                self.povms[lbl] = _povm.convert(povm, to_type, self.basis, ideal, flatten_structure, cp_penalty=spam_cp_penalty)
 
         self._clean_paramvec()  # param indices were probabaly updated
         if set_default_gauge_group:
@@ -1746,3 +1797,30 @@ def operation_layer_operator(self, model, layerlbl, caches):
             return model.operations[layerlbl]
         else:
             return _opfactory.op_from_factories(model.factories, layerlbl)
+        
+    def get_dense_process_matrix_represention_for_gate(self, model: ExplicitOpModel, lbl: _LabelTup):
+        """
+        Get the dense process matrix corresponding to the lbl.
+        Note this should be the minimal size required to represent the dense operator.
+
+        Parameters
+        ----------
+        lbl: Label
+            A label with a gate name and a specific set of qubits it will be acting on.
+        
+        Returns
+        ----------
+        _np.ndarray
+        """
+
+        if lbl not in model.operations:
+            raise KeyError(f'Operation with lable {lbl} not found in model.operations.')
+
+        operation = model.operations[lbl]
+
+        if isinstance(operation, _EmbeddedOp):
+            return operation.embedded_op.to_dense()
+        elif isinstance(operation, _ComposedOp):
+            breakpoint()
+        return operation.to_dense('minimal')
+
diff --git a/pygsti/models/implicitmodel.py b/pygsti/models/implicitmodel.py
index 9ce554f7e..6c513778c 100644
--- a/pygsti/models/implicitmodel.py
+++ b/pygsti/models/implicitmodel.py
@@ -20,6 +20,7 @@
 from pygsti.modelmembers import povms as _povm
 from pygsti.modelmembers.modelmembergraph import ModelMemberGraph as _MMGraph
 from pygsti.baseobjs.label import Label as _Label
+from pygsti.models.memberdict import OrderedMemberDict as _OrderedMemberDict
 
 from pygsti.baseobjs.basis import Basis as _Basis
 from pygsti.baseobjs.statespace import StateSpace as _StateSpace
@@ -369,6 +370,51 @@ def _from_nice_serialization(cls, state):
             root_dicts[root_key][sub_key].update(mm_dict)  # Note: sub_keys should already be created
         return mdl
 
+    def create_explicit(self, composedops_as_views=False, spam_type='full TP', op_type='CPTPLND'):
+        from pygsti.models import ExplicitOpModel
+        if isinstance(composedops_as_views, str):
+            # Old positional argument behavior.
+            spam_type = composedops_as_views
+            op_type = composedops_as_views
+            composedops_as_views = False
+        
+        state = self.__getstate__()
+        state['povms'] = state['povm_blks']['layers']
+        state['preps'] = state['prep_blks']['layers']
+        opdict = _OrderedMemberDict(None, spam_type, None, dict())
+        opdict.update(state['_opcaches']['complete-layers'])
+        state['operations'] = opdict
+
+        for v in state['operations'].values():
+            v.unlink_parent(force=True)
+        for v in state['povms'].values():
+            v.unlink_parent(force=True)
+        for v in state['preps'].values():
+            v.unlink_parent(force=True)
+
+        eom = ExplicitOpModel(self.state_space)
+        eom.preps.update(state['preps'])
+        eom.povms.update(state['povms'])
+        eom.operations.update(state['operations'])
+
+        eom.convert_members_inplace(
+            to_type=spam_type, categories_to_convert=('preps', 'povms'),
+            allow_smaller_pp_basis=True, flatten_structure=True
+        )
+        eom.convert_members_inplace(
+            to_type=op_type, categories_to_convert=('operations',),
+            allow_smaller_pp_basis=True
+        )
+        if composedops_as_views:
+            from pygsti.modelmembers.operations import ComposedOp
+            allop_keys = eom.operations.keys()
+            for k in allop_keys:
+                curr_op = eom[k]
+                if isinstance(curr_op, ComposedOp) and all([subk in allop_keys for subk in k]):
+                    view_op = ComposedOp([eom[subk] for subk in k])
+                    eom[k] = view_op
+            eom._rebuild_paramvec()
+        return eom
 
 def _init_spam_layers(model, prep_layers, povm_layers):
     """ Helper function for initializing the .prep_blks and .povm_blks elements of an implicit model"""
diff --git a/pygsti/models/layerrules.py b/pygsti/models/layerrules.py
index d1fcd9357..d47cd21ab 100644
--- a/pygsti/models/layerrules.py
+++ b/pygsti/models/layerrules.py
@@ -13,7 +13,7 @@
 
 from pygsti.modelmembers import operations as _op
 from pygsti.baseobjs.nicelyserializable import NicelySerializable as _NicelySerializable
-
+from pygsti.baseobjs.label import LabelTup as _LabelTup
 
 class LayerRules(_NicelySerializable):
     """
@@ -110,3 +110,19 @@ def operation_layer_operator(self, model, layerlbl, cache):
         """
         #raise KeyError(f"Cannot create operator for non-primitive layer: {layerlbl}")
         raise KeyError("Cannot create operator for non-primitive layer: %s" % str(layerlbl))
+    
+    def get_dense_process_matrix_represention_for_gate(self, model, lbl: _LabelTup):
+        """
+        Get the dense process matrix corresponding to the lbl.
+        Note this should be the minimal size required to represent the dense operator.
+
+        Parameters
+        ----------
+        lbl: Label
+            A label with a gate name and a specific set of qubits it will be acting on.
+        
+        Returns
+        ----------
+        _np.ndarray
+        """
+        raise KeyError("Cannot find a dense operator for layer: %s" % str(lbl))
diff --git a/pygsti/models/localnoisemodel.py b/pygsti/models/localnoisemodel.py
index 739fb8f7d..9be357b23 100644
--- a/pygsti/models/localnoisemodel.py
+++ b/pygsti/models/localnoisemodel.py
@@ -37,6 +37,7 @@
 from pygsti.tools import optools as _ot
 from pygsti.tools import listtools as _lt
 from pygsti.processors.processorspec import ProcessorSpec as _ProcessorSpec, QubitProcessorSpec as _QubitProcessorSpec
+from pygsti.baseobjs.label import LabelTup as _LabelTup
 
 
 class LocalNoiseModel(_ImplicitOpModel):
@@ -171,7 +172,7 @@ def __init__(self, processor_spec, gatedict, prep_layers=None, povm_layers=None,
         idle_names = processor_spec.idle_gate_names
         global_idle_layer_label = processor_spec.global_idle_layer_label
 
-        layer_rules = _SimpleCompLayerRules(qudit_labels, implicit_idle_mode, None, global_idle_layer_label)
+        layer_rules = _SimpleCompLayerRules(qudit_labels, implicit_idle_mode, None, global_idle_layer_label, independent_gates=independent_gates)
 
         super(LocalNoiseModel, self).__init__(state_space, layer_rules, 'pp',
                                               simulator=simulator, evotype=evotype)
@@ -406,7 +407,7 @@ def rescale(coeffs):
 
 class _SimpleCompLayerRules(_LayerRules):
 
-    def __init__(self, qubit_labels, implicit_idle_mode, singleq_idle_layer_labels, global_idle_layer_label):
+    def __init__(self, qubit_labels, implicit_idle_mode, singleq_idle_layer_labels, global_idle_layer_label, independent_gates):
         super().__init__()
         self.implicit_idle_mode = implicit_idle_mode  # how to handle implied idles ("blanks") in circuits
         self.qubit_labels = qubit_labels
@@ -414,6 +415,7 @@ def __init__(self, qubit_labels, implicit_idle_mode, singleq_idle_layer_labels,
         self._add_global_idle_to_all_layers = False
         self._add_padded_idle = False
         self.use_op_caching = True  # expert functionality - can be turned off if needed
+        self._spatial_homogeneity_assumed = not independent_gates
 
         if implicit_idle_mode not in ('none', 'add_global', 'only_global', 'pad_1Q'):
             raise ValueError("Invalid `implicit_idle_mode`: '%s'" % str(implicit_idle_mode))
@@ -612,3 +614,41 @@ def _layer_component_operation(self, model, complbl, cache):
         else:
             ret = _opfactory.op_from_factories(model.factories['layers'], complbl)
         return ret
+
+    def get_dense_process_matrix_represention_for_gate(self, model: _ImplicitOpModel, lbl: _LabelTup):
+        """
+        Get the dense process matrix corresponding to the lbl.
+        Note this should be the minimal size required to represent the dense operator.
+
+        Parameters
+        ----------
+        lbl: Label
+            A label with a gate name and a specific set of qubits it will be acting on.
+        
+        Returns
+        ----------
+        _np.ndarray
+        """
+
+        key = lbl.name if self._spatial_homogeneity_assumed else lbl
+        key_without_args = lbl.strip_args()
+
+        if key in model.operation_blks["gates"]:
+            return model.operation_blks["gates"][key].to_dense()
+        
+        elif key_without_args in model.factories['layers']:
+            ret = _opfactory.op_from_factories(model.factories['layers'], key)
+
+            return ret.embedded_op.to_dense()
+
+        elif self._add_padded_idle:
+            # We have idle gates that we can include.
+            absent_sslbls = lbl[1:]
+            new_key = self.single_qubit_idle_layer_labels[absent_sslbls]
+            if self._spatial_homogeneity_assumed:
+                new_key = new_key.name
+            return model.operation_blks["gates"][new_key].to_dense()
+    
+        else:
+            # Assume a perfect idle q-qubit gate.
+            return _np.eye(4**len(lbl.qubits))
diff --git a/pygsti/models/model.py b/pygsti/models/model.py
index 5f6f8e940..a6db8a942 100644
--- a/pygsti/models/model.py
+++ b/pygsti/models/model.py
@@ -662,7 +662,7 @@ def _iter_parameterized_objs(self):
         #return # default is to have no parameterized objects
 
     #TODO: Make this work with param interposers.
-    def _check_paramvec(self, debug=False):
+    def _check_paramvec(self, debug=True):
         if debug: print("---- Model._check_paramvec ----")
 
         ops_paramvec = self._model_paramvec_to_ops_paramvec(self._paramvec)
@@ -945,9 +945,7 @@ def uncollect_parameters(self, param_to_uncollect):
         self._rebuild_paramvec()
 
     def _rebuild_paramvec(self):
-        """ Resizes self._paramvec and updates gpindices & parent members as needed,
-            and will initialize new elements of _paramvec, but does NOT change
-            existing elements of _paramvec (use _update_paramvec for this)"""
+        """ Resizes self._paramvec and updates gpindices & parent members as needed."""
         w = self._model_paramvec_to_ops_paramvec(self._paramvec)
         Np = len(w)  # NOT self.num_params since the latter calls us!
         wl = self._paramlbls
@@ -1806,10 +1804,10 @@ def complete_circuits(self, circuits, prep_lbl_to_prepend=None, povm_lbl_to_appe
             already has a prep label this argument will be ignored.
 
         povm_lbl_to_append : Label, optional (default None)
-            Optional user specified prep label to prepend. If not
+            Optional user specified povm label to prepend. If not
             specified will use the default value as given by
             :meth:_default_primitive_prep_layer_lbl. If the circuit
-            already has a prep label this argument will be ignored.
+            already has a povm label this argument will be ignored.
         
         return_split : bool, optional (default False)
             If True we additionally return a list of tuples of the form:
@@ -1836,7 +1834,7 @@ def complete_circuits(self, circuits, prep_lbl_to_prepend=None, povm_lbl_to_appe
 
         #precompute unique default povm labels.
         unique_sslbls = set([ckt._line_labels for ckt in circuits])
-        default_povm_labels = {sslbls:(self._default_primitive_povm_layer_lbl(sslbls),) for sslbls in unique_sslbls}
+        default_povm_labels = {sslbls: (self._default_primitive_povm_layer_lbl(sslbls),) for sslbls in unique_sslbls}
 
         comp_circuits = []
         if return_split:
diff --git a/pygsti/models/modelconstruction.py b/pygsti/models/modelconstruction.py
index 42c4763d2..7b7c15fd3 100644
--- a/pygsti/models/modelconstruction.py
+++ b/pygsti/models/modelconstruction.py
@@ -37,6 +37,7 @@
 from pygsti.models import gaugegroup as _gg
 from pygsti.models.localnoisemodel import LocalNoiseModel as _LocalNoiseModel
 from pygsti.models.cloudnoisemodel import CloudNoiseModel as _CloudNoiseModel
+from pygsti.models.singlequbitequivalencerules import EquivalentClassesLocalNoiseModel as _EquivalentClassesLocalNoiseModel
 from pygsti.baseobjs import label as _label
 from pygsti.baseobjs import statespace as _statespace
 from pygsti.baseobjs.basis import Basis as _Basis
@@ -1525,6 +1526,52 @@ def _setup_local_gates(processor_spec, evotype, modelnoise=None, custom_gates=No
                 if (noiseop is not None) else ideal_factory
     return gatedict
 
+def _create_crosstalk_free_model_with_equivalent_clases(qudit_to_spatially_equivalent_qudit ,processor_spec, modelnoise, custom_gates=None, evotype="default", simulator="auto",
+                                 on_construction_error='raise', independent_gates=False, independent_spam=True,
+                                 ensure_composed_gates=False, ideal_gate_type='auto', ideal_prep_type='auto',
+                                 ideal_povm_type='auto', implicit_idle_mode='none', basis='pp') -> _EquivalentClassesLocalNoiseModel:
+    """
+    Create a n-qudit "crosstalk-free" model while assuming that certain qudits are spatially equivalent for 1 qudit gates.
+
+    Similar to :meth:`create_crosstalk_free_model` but the noise is input more generally,
+    as a :class:`ModelNoise` object.  Arguments are the same as this function except that
+    `modelnoise` is given instead of several more specific noise-describing arguments.
+
+    Returns
+    -------
+    EquivalentClassesLocalNoiseModel
+    """
+
+    qudit_labels = processor_spec.qudit_labels
+    state_space = _statespace.QubitSpace(qudit_labels) if all([udim == 2 for udim in processor_spec.qudit_udims]) \
+        else _statespace.QuditSpace(qudit_labels, processor_spec.qudit_udims)
+    evotype = _Evotype.cast(evotype, state_space=state_space)
+    modelnoise = _OpModelNoise.cast(modelnoise)
+    modelnoise.reset_access_counters()
+
+    if ideal_gate_type == "auto":
+        ideal_gate_type = ('static standard', 'static clifford', 'static unitary')
+    if ideal_prep_type == "auto":
+        ideal_prep_type = _state.state_type_from_op_type(ideal_gate_type)
+    if ideal_povm_type == "auto":
+        ideal_povm_type = _povm.povm_type_from_op_type(ideal_gate_type)
+
+    gatedict = _setup_local_gates(processor_spec, evotype, modelnoise, custom_gates, ideal_gate_type, basis)
+
+    # (Note: global idle is now handled through processor-spec processing)
+
+    # SPAM:
+    local_noise = True
+    prep_layers, povm_layers = _create_spam_layers(processor_spec, modelnoise, local_noise,
+                                                   ideal_prep_type, ideal_povm_type, evotype,
+                                                   state_space, independent_spam, basis)
+
+    modelnoise.warn_about_zero_counters()
+    return _EquivalentClassesLocalNoiseModel(qudit_to_spatially_equivalent_qudit, processor_spec, gatedict, prep_layers, povm_layers,
+                            evotype, simulator, on_construction_error,
+                            independent_gates, ensure_composed_gates,
+                            implicit_idle_mode)
+
 
 def create_crosstalk_free_model(processor_spec, custom_gates=None,
                                 depolarization_strengths=None, stochastic_error_probs=None, lindblad_error_coeffs=None,
@@ -1533,7 +1580,7 @@ def create_crosstalk_free_model(processor_spec, custom_gates=None,
                                 evotype="default", simulator="auto", on_construction_error='raise',
                                 independent_gates=False, independent_spam=True, ensure_composed_gates=False,
                                 ideal_gate_type='auto', ideal_spam_type='computational', implicit_idle_mode='none',
-                                basis='pp'):
+                                basis='pp', qudit_to_equivalent_qudit:dict[int, int] = None):
     """
     Create a n-qudit "crosstalk-free" model.
 
@@ -1679,6 +1726,12 @@ def create_crosstalk_free_model(processor_spec, custom_gates=None,
                                              depolarization_parameterization, stochastic_parameterization,
                                              lindblad_parameterization, allow_nonlocal=False)
 
+    if qudit_to_equivalent_qudit:
+        return _create_crosstalk_free_model_with_equivalent_clases(qudit_to_equivalent_qudit, processor_spec, modelnoise, custom_gates, evotype,
+                                                                   simulator, on_construction_error, independent_gates, independent_spam,
+                                                                   ensure_composed_gates, ideal_gate_type, ideal_spam_type, ideal_spam_type, implicit_idle_mode, basis)
+    
+
     return _create_crosstalk_free_model(processor_spec, modelnoise, custom_gates, evotype,
                                         simulator, on_construction_error, independent_gates, independent_spam,
                                         ensure_composed_gates, ideal_gate_type, ideal_spam_type, ideal_spam_type,
diff --git a/pygsti/models/singlequbitequivalencerules.py b/pygsti/models/singlequbitequivalencerules.py
new file mode 100644
index 000000000..065b60942
--- /dev/null
+++ b/pygsti/models/singlequbitequivalencerules.py
@@ -0,0 +1,115 @@
+from pygsti.models.localnoisemodel import _SimpleCompLayerRules, LocalNoiseModel as _LocalNoiseModel
+from pygsti.baseobjs.label import Label, LabelTup, LabelTupTup
+from pygsti.modelmembers.operations import opfactory as _opfactory
+
+
+
+class SingleQuditGateEquivalenceClassesLayerRules(_SimpleCompLayerRules):
+    """
+    Submodel which assumes that you have a set of qubits for which you trust the action of a single
+    qubit gate equally for all qubits within the set.
+    """
+
+    def __init__(self, qubit_labels, implicit_idle_mode, singleq_idle_layer_labels, global_idle_layer_label):
+
+        super().__init__(qubit_labels, implicit_idle_mode, singleq_idle_layer_labels, global_idle_layer_label)
+
+    def operation_layer_operator(self, model, layerlbl: Label, caches):
+        """
+        Create the operator corresponding to `layerlbl`.
+
+        Parameters
+        ----------
+        layerlbl : Label
+            A circuit layer label.
+
+        Returns
+        -------
+        LinearOperator
+        """
+
+        if layerlbl in caches['complete-layers']:
+            return caches['complete-layers'][layerlbl]
+        
+        if isinstance(layerlbl, LabelTupTup):
+            # This could be a multiple qubit gate or multiple single qubit gates.
+
+            group = []
+            changed = False
+            
+            for op in layerlbl:
+                assert isinstance(op, LabelTup)
+                qubits_used = op.qubits
+                if op.num_qubits == 1:
+                    if model._qubits_to_equiv_qubit[qubits_used[0]] != qubits_used[0]:
+                        new_label = Label(op.name, model._qubits_to_equiv_qubit[qubits_used[0]], op.time, *op.args)
+
+                        changed = True
+                        group.append(new_label)
+                    else:
+                        group.append(op)
+                else:
+                    group.append(op)
+
+            if changed:
+                new_args = None if layerlbl.args == () else layerlbl.args
+                new_time = 0.0 if layerlbl.time == None else layerlbl.time
+                new_label = Label(group)
+            else:
+                new_label = layerlbl
+
+            # Get the operator
+            if new_label in caches['complete-layers']:
+                caches['complete-layers'][layerlbl] = caches['complete-layers'][new_label]
+                return caches['complete-layers'][new_label]
+            else:
+
+                answer = super().operation_layer_operator(model, new_label, caches)
+                caches['complete-layers'][new_label] = answer
+                caches['complete-layers'][layerlbl] = answer
+                return answer
+
+
+        elif isinstance(layerlbl, LabelTup):
+
+            qubits_used = layerlbl.qubits
+            if layerlbl.num_qubits == 1:
+                if model._qubits_to_equiv_qubit[qubits_used[0]] != qubits_used[0]:
+                    new_args = None if layerlbl.args == () else layerlbl.args
+                    new_time = 0.0 if layerlbl.time == None else layerlbl.time
+                    new_label = Label(layerlbl.name, model._qubits_to_equiv_qubit[qubits_used[0]], new_time, new_args)
+
+                    # Get the operator
+                    if new_label in caches['complete-layers']:
+                        caches['complete-layers'][layerlbl] = caches['complete-layers'][new_label]
+                        return caches['complete-layers'][new_label]
+                    else:
+
+                        answer = super().operation_layer_operator(model, new_label, caches)
+                        caches['complete-layer'][new_label] = answer
+                        caches['complete-layer'][layerlbl] = answer
+                        return answer
+
+        return super().operation_layer_operator(model, layerlbl, caches)
+
+
+class EquivalentClassesLocalNoiseModel(_LocalNoiseModel):
+
+    def __init__(self, qubit_to_equivalent_qubit_for_single_qgates: dict, processor_spec, gatedict, prep_layers=None, povm_layers=None, evotype="default",
+                simulator="auto", on_construction_error='raise',
+                independent_gates=False, ensure_composed_gates=False, implicit_idle_mode="none"):
+
+
+        super().__init__(processor_spec, gatedict, prep_layers, povm_layers, evotype, simulator,
+                         on_construction_error, independent_gates, ensure_composed_gates, implicit_idle_mode)
+        
+        # Now we need to reset the layer rules to use the Equivalent class rules.
+
+        old_rules = self._layer_rules
+
+        new_rules = SingleQuditGateEquivalenceClassesLayerRules( old_rules.qubit_labels, old_rules.implicit_idle_mode,
+                                                                old_rules.single_qubit_idle_layer_labels, old_rules.global_idle_layer_label)
+        
+        self._layer_rules = new_rules
+        self._qubits_to_equiv_qubit = qubit_to_equivalent_qubit_for_single_qgates
+        self._reinit_opcaches() # Clear the caches for using the new rules.
diff --git a/pygsti/optimize/customsolve.py b/pygsti/optimize/customsolve.py
index f03423400..e7552eca8 100644
--- a/pygsti/optimize/customsolve.py
+++ b/pygsti/optimize/customsolve.py
@@ -23,6 +23,9 @@
     _fastcalc = None
 
 
+CUSTOM_SOLVE_THRESHOLD = 10_000
+
+
 def custom_solve(a, b, x, ari, resource_alloc, proc_threshold=100):
     """
     Simple parallel Gaussian Elimination with pivoting.
@@ -95,7 +98,7 @@ def custom_solve(a, b, x, ari, resource_alloc, proc_threshold=100):
         return
 
     #Just gather everything to one processor and compute there:
-    if comm.size < proc_threshold and a.shape[1] < 10000:
+    if comm.size < proc_threshold and a.shape[1] < CUSTOM_SOLVE_THRESHOLD:
         # We're not exactly sure where scipy is better, but until we speed up / change gaussian-elim
         # alg the scipy alg is much faster for small numbers of procs and so should be used unless
         # A is too large to be gathered to the root proc.
@@ -163,9 +166,11 @@ def custom_solve(a, b, x, ari, resource_alloc, proc_threshold=100):
         # Step 1: find the index of the row that is the best pivot.
         # each proc looks for its best pivot (Note: it should not consider rows already pivoted on)
         potential_pivot_indices = all_row_indices[potential_pivot_mask]
-        ibest_global, ibest_local, h, k = _find_pivot(a, b, icol, potential_pivot_indices, my_row_slice,
-                                                      shared_floats, shared_ints, resource_alloc, comm, host_comm,
-                                                      smbuf1, smbuf2, smbuf3, host_index_buf, host_val_buf)
+        ibest_global, ibest_local, h, k = _find_pivot(
+            a, b, icol, potential_pivot_indices,
+            my_row_slice, shared_floats, shared_ints, resource_alloc,
+            comm, host_comm, smbuf1, smbuf1b, smbuf2,
+            smbuf3, host_index_buf, host_val_buf)
 
         # Step 2: proc that owns best row (holds that row and is root of param-fine comm) broadcasts it
         pivot_row, pivot_b = _broadcast_pivot_row(a, b, ibest_local, h, k, shared_rowb, local_pivot_rowb,
@@ -210,8 +215,12 @@ def custom_solve(a, b, x, ari, resource_alloc, proc_threshold=100):
     return
 
 
-def _find_pivot(a, b, icol, potential_pivot_inds, my_row_slice, shared_floats, shared_ints,
-                resource_alloc, comm, host_comm, buf1, buf1b, buf2, buf3, best_host_indices, best_host_vals):
+def _find_pivot(
+        a, b, icol, potential_pivot_inds,
+        my_row_slice, shared_floats, shared_ints, resource_alloc,
+        comm, host_comm, buf1, buf1b,
+        buf2, buf3, best_host_indices, best_host_vals
+    ):
     
     #print(f'Length potential_pivot_inds {len(potential_pivot_inds)}')
     #print(f'potential_pivot_inds: {potential_pivot_inds}')
diff --git a/pygsti/optimize/simplerlm.py b/pygsti/optimize/simplerlm.py
index a027bfd53..d6003151e 100644
--- a/pygsti/optimize/simplerlm.py
+++ b/pygsti/optimize/simplerlm.py
@@ -16,6 +16,7 @@
 
 import numpy as _np
 import scipy as _scipy
+from scipy import stats as _stats
 
 from pygsti.optimize import arraysinterface as _ari
 from pygsti.optimize.customsolve import custom_solve as _custom_solve
@@ -107,6 +108,7 @@ def cast(cls, obj):
             return cls(**obj) if obj else cls()
 
     def __init__(self):
+        self.tol = dict()
         super().__init__()
 
 
@@ -191,10 +193,10 @@ def __init__(self, maxiter=100, maxfev=100, tol=1e-6, fditer=0, first_fditer=0,
                  oob_action="reject", oob_check_mode=0, serial_solve_proc_threshold=100, lsvec_mode="normal"):
 
         super().__init__()
-        if isinstance(tol, float): tol = {'relx': 1e-8, 'relf': tol, 'f': 1.0, 'jac': tol, 'maxdx': 1.0}
+        if isinstance(tol, float): tol = {'relx': 1e-8, 'relf': tol, 'f': 1.0, 'jac': tol, 'maxdx': 1.0, 'chi2_icdf_tol': 0.0, 'critval_discount': 1.0}
         self.maxiter = maxiter
         self.maxfev = maxfev
-        self.tol = tol
+        self.tol.update(tol)
         self.fditer = fditer
         self.first_fditer = first_fditer
         self.init_munu = init_munu
@@ -283,6 +285,13 @@ def run(self, objective: TimeIndependentMDCObjectiveFunction, profiler, printer)
         else:
             ari = _ari.UndistributedArraysInterface(nEls, nP)
 
+        chi2_icdf_tol    = self.tol.get('chi2_icdf_tol', 1e-6)
+        critical_value   = _stats.chi2.ppf(chi2_icdf_tol, objective.num_data_params() - objective.model.num_params)
+        critval_discount = self.tol.get('critval_discount', 1.0)
+        def chi2_stopper(norm_f):
+            chi2val = objective.chi2k_distributed_qty(norm_f)
+            return chi2val < critval_discount * critical_value
+
         opt_x, converged, msg, mu, nu, norm_f, f = simplish_leastsq(
             objective_func, jacobian, x0,
             max_iter=self.maxiter,
@@ -291,6 +300,7 @@ def run(self, objective: TimeIndependentMDCObjectiveFunction, profiler, printer)
             jac_norm_tol=self.tol.get('jac', 1e-6),
             rel_ftol=self.tol.get('relf', 1e-6),
             rel_xtol=self.tol.get('relx', 1e-8),
+            chi2_stopper=chi2_stopper,
             max_dx_scale=self.tol.get('maxdx', 1.0),
             init_munu=self.init_munu,
             oob_check_interval=self.oob_check_interval,
@@ -300,7 +310,8 @@ def run(self, objective: TimeIndependentMDCObjectiveFunction, profiler, printer)
             arrays_interface=ari,
             serial_solve_proc_threshold=self.serial_solve_proc_threshold,
             x_limits=x_limits,
-            verbosity=printer - 1, profiler=profiler)
+            verbosity=printer - 1, profiler=profiler,
+            )
 
         printer.log("Least squares message = %s" % msg, 2)
         assert(converged), "Failed to converge: %s" % msg
@@ -372,7 +383,7 @@ def jac_guarded(k: int, num_fd_iters: int, obj_fn: Callable, jac_fn: Callable, f
 
 def simplish_leastsq(
     obj_fn, jac_fn, x0, f_norm2_tol=1e-6, jac_norm_tol=1e-6,
-    rel_ftol=1e-6, rel_xtol=1e-6, max_iter=100, num_fd_iters=0, max_dx_scale=1.0,
+    rel_ftol=1e-6, rel_xtol=1e-6, chi2_stopper=None, max_iter=100, num_fd_iters=0, max_dx_scale=1.0,
     init_munu="auto", oob_check_interval=0, oob_action="reject", oob_check_mode=0,
     resource_alloc=None, arrays_interface=None, serial_solve_proc_threshold=100,
     x_limits=None, verbosity=0, profiler=None
@@ -414,6 +425,10 @@ def simplish_leastsq(
         Tolerance on the relative value of `|x|`, so that if
         `d(|x|)/|x| < rel_xtol` then mark converged.
 
+    chi2_stopper : Callable[float] -> float, optional
+        Terminate if chi2_stopper( norm( sum(obj_fn(x)**2) ) ) is True.
+        If None, then we set to a lambda function that always returns False.
+
     max_iter : int, optional
         The maximum number of (outer) interations.
 
@@ -518,10 +533,17 @@ def simplish_leastsq(
     max_norm_dx = (max_dx_scale**2) * len(global_x) if max_dx_scale else None
     # ^ don't let any component change by more than ~max_dx_scale
 
+    if chi2_stopper is None:
+        chi2_stopper = lambda _: False
 
     f = obj_fn(global_x)  # 'E'-type array
     norm_f = ari.norm2_f(f)
     if not _np.isfinite(norm_f):
+        # TODO: this path can be hit when f contains NaNs. We should
+        # really have a separate error message for that. Performing
+        # the check will require updating our ArraysInterface API
+        # (which isn't hard but is beside the point as I'm writing this)
+        #   -Riley.
         msg = "Infinite norm of objective function at initial point!"
 
     if len(global_x) == 0:  # a model with 0 parameters - nothing to optimize
@@ -543,6 +565,18 @@ def simplish_leastsq(
             if len(msg) > 0:
                 break  # exit outer loop if an exit-message has been set
 
+            if chi2_stopper(norm_f):
+                if oob_check_interval <= 1:
+                    msg = "Critical value for chi2 statistic acheived."
+                    converged = True
+                    break
+                else:
+                    printer.log(("** Converged with out-of-bounds with check interval=%d, reverting to last know in-bounds point and setting interval=1 **") % oob_check_interval, 2)
+                    oob_check_interval = 1
+                    x[:] = best_x[:]
+                    mu, nu, norm_f, f[:] = best_x_state
+                    continue   
+
             if norm_f < f_norm2_tol:
                 if oob_check_interval <= 1:
                     msg = "Sum of squares is at most %g" % f_norm2_tol
diff --git a/pygsti/processors/processorspec.py b/pygsti/processors/processorspec.py
index 514dd2cd8..ba8a59013 100644
--- a/pygsti/processors/processorspec.py
+++ b/pygsti/processors/processorspec.py
@@ -1322,7 +1322,7 @@ def compute_clifford_ops_on_qubits(self):
 
         return clifford_ops_on_qubits
 
-        ### TODO: do we still need this?
+    ### TODO: do we still need this?
     @lru_cache(maxsize=100)
     def compute_clifford_2Q_connectivity(self):
         """
@@ -1362,8 +1362,17 @@ def compute_2Q_connectivity(self):
         qubit_labels = self.qubit_labels
         for gn in self.gate_names:
             if self.gate_num_qubits(gn) == 2:
-                for sslbls in self.resolved_availability(gn, 'tuple'):
-                    twoQ_connectivity[qubit_labels.index(sslbls[0]), qubit_labels.index(sslbls[1])] = True
+                avail = self.resolved_availability(gn, 'tuple')
+                if len(avail) == 1 and avail[0] is None and gn == '{idle}':
+                    avail = [qubit_labels]
+                    # if qubit_labels.size == 2:
+                    #     avail = [qubit_labels]
+                    # else:
+                    #     raise ValueError('Availability of the idle gate has not been set.')
+                for sslbls in avail:
+                    i = qubit_labels[sslbls[0]]
+                    j = qubit_labels[sslbls[1]]
+                    twoQ_connectivity[i, j] = True
 
         return _qgraph.QubitGraph(qubit_labels, twoQ_connectivity)
     
diff --git a/pygsti/protocols/crosstalkfreeexperimentdesign.py b/pygsti/protocols/crosstalkfreeexperimentdesign.py
new file mode 100644
index 000000000..0e94d1661
--- /dev/null
+++ b/pygsti/protocols/crosstalkfreeexperimentdesign.py
@@ -0,0 +1,484 @@
+import numpy as np
+from pygsti.protocols import CircuitListsDesign, HasProcessorSpec
+from pygsti.circuits.circuitlist import CircuitList
+from pygsti.circuits.circuit import Circuit
+from pygsti.circuits.split_circuits_into_lanes import batch_tensor
+from pygsti.baseobjs.label import Label
+import copy
+
+
+def find_neighbors(vertices: list, edges: list) -> dict:
+    """
+    Find the neighbors of each vertex in a graph.
+
+    This function takes a list of vertices and a dictionary of edges, 
+    where edges are represented as tuples. It returns a dictionary 
+    mapping each vertex to a list of its neighbors.
+
+    Parameters:
+    vertices (list): A list of vertices in the graph.
+    edges (dict): A symmetric list of the edges in the graph [e.g., (v1, v2) and (v2, v1) are elements].
+
+    Returns:
+    dict: A dictionary where each key is a vertex and the value is a 
+          list of neighboring vertices.
+    """
+    neighbors = {v: [] for v in vertices}
+    for e in edges:
+        neighbors[e[0]].append(e[1])
+    return neighbors
+
+
+def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, 
+                                       oneq_gstdesign, twoq_gstdesign, randstate: int) -> tuple:
+    """
+    Generate crosstalk-free GST circuits by stitching together 1Q and 2Q GST circuits for 
+    each color patch.
+
+    This function combines 1Q and 2Q GST circuits based on the specified color patches.
+    For each germ power L, it randomizes the order of the 2Q GST circuits and the 1Q GST 
+    circuits for each edge and unused qubit. The circuits are then stitched together to 
+    form the final circuit lists.
+
+    Parameters:
+    color_patches (dict): A dictionary mapping color patches to their corresponding edge sets.
+                          A 'color patch' is a set of similarly colored edges in an edge coloring.
+    vertices (list): A list of vertices in the graph.
+    oneq_gstdesign: A GST edesign containing the 1Q GST circuits.
+    twoq_gstdesign: An GST edesign containing the 2Q GST circuit.
+    randstate: A random state object used for randomization.
+
+    Returns:
+    tuple: A tuple containing:
+        - circuit_lists (list): A list of crosstalk-free GST circuits for each germ power.
+        - aux_info (dict): Auxiliary information mapping circuits to their corresponding edges and vertices.
+    """
+
+    circuit_lists = [[] for _ in twoq_gstdesign.circuit_lists]
+    twoq_idle_label = Label(('Gii',) + twoq_gstdesign.qubit_labels)
+    oneq_idle_label = Label(('Gi',) + oneq_gstdesign.qubit_labels)
+    mapper_2q: dict[Label, Label] = {twoq_idle_label: twoq_idle_label}
+    mapper_1q: dict[Label, Label] = {oneq_idle_label: oneq_idle_label}
+    for cl in twoq_gstdesign.circuit_lists:
+        for c in cl:
+            mapper_2q.update({k:k for k in c._labels})
+            mapper_2q[Label(())] = twoq_idle_label
+    for cl in oneq_gstdesign.circuit_lists:
+        for c in cl:
+            mapper_1q.update({k:k for k in c._labels})
+            mapper_1q[Label(())] = oneq_idle_label
+    assert Label(()) not in mapper_2q.values()
+    assert Label(()) not in mapper_1q.values()
+    aux_info = {}
+
+    m2q = mapper_2q.copy()
+    for k2 in mapper_2q:
+        if k2.num_qubits == 1:
+            tgt = k2[1]
+            tmp = [None, None]
+            tmp[tgt] = k2
+            tmp[1-tgt] = Label("Gi", 1-tgt)
+            m2q[k2] = tuple(tmp)
+
+    mapper_2q = m2q # Reset here.
+    layer_mappers = {1: mapper_1q, 2: mapper_2q}
+
+    num_lines = -1
+    global_line_order = None
+    for patch, edge_set in color_patches.items():
+        # This might be broken when edge_set is empty.
+        used_qubits = np.array(edge_set).ravel()
+        unused_qubits = np.setdiff1d(np.array(vertices), used_qubits)
+        assert len(oneq_gstdesign.circuit_lists) == len(twoq_gstdesign.circuit_lists), "Not implemented."
+
+        for L, (oneq_circuits, twoq_circuits) in enumerate(zip(oneq_gstdesign.circuit_lists, twoq_gstdesign.circuit_lists)):   # assumes that they use the same L 
+            oneq_len = len(oneq_circuits)
+            twoq_len = len(twoq_circuits)
+
+            max_len = max(oneq_len, twoq_len)
+            min_len = min(oneq_len, twoq_len)
+            num_batches = int(np.ceil(max_len / min_len))
+
+            if oneq_len > twoq_len:
+                # node_perms = [randstate.permutation(max_len) for _ in unused_qubits]
+                # edge_perms = [[] for _ in edge_set]
+                # for _ in range(num_batches):
+                #     for perm in edge_perms:
+                #         perm.extend([randstate.permutation(min_len)])
+                # edge_perms = [mp[:max_len] for mp in edge_perms]
+                raise NotImplementedError()
+        
+            # 2Q GST circuit list is longer
+            edge_perms = [randstate.permutation(max_len) for _ in edge_set] # Randomize the order in which we place 2Q GST circuits on each edge
+            node_perms = []
+            for i in range(unused_qubits.size):
+                perms = [randstate.permutation(min_len) for _ in range(num_batches)]
+                node_perms.append(np.concatenate(perms)[:max_len])
+            edge_perms = np.array(edge_perms)
+            node_perms = np.array(node_perms)
+        
+            # Check invariants
+            edge_line_contributions = 2*edge_perms.shape[0] if edge_perms.size > 0 else 0
+            node_line_contributions =   node_perms.shape[0] if node_perms.size > 0 else 0
+            curr_num_lines = edge_line_contributions + node_line_contributions
+            if num_lines < 0:
+                num_lines = curr_num_lines
+                global_line_order = tuple(range(num_lines))
+
+            assert num_lines == curr_num_lines
+            if edge_perms.size > 0 and node_perms.size > 0:
+                assert edge_perms.shape[1] == node_perms.shape[1]
+            
+            # Form the tensor product circuits, over all qubits.
+            for j in range(max_len):
+                tensored_lines  = []
+                circs_to_tensor = []
+                if len(edge_perms):
+                    edge_start = 1
+                    node_start = 0
+                    c = twoq_circuits[edge_perms[0,j]]
+                    tensored_lines.append(edge_set[0])
+                else:
+                    edge_start = 0
+                    node_start = 1
+                    c = oneq_circuits[node_perms[0,j]]
+                    tensored_lines.append((unused_qubits[0],))
+                circs_to_tensor.append(c)
+                for i in range(edge_start, edge_perms.shape[0]):
+                    c2 = twoq_circuits[ edge_perms[i,j] ]
+                    circs_to_tensor.append( c2 )
+                    tensored_lines.append(edge_set[i])
+                for i in range(node_start, node_perms.shape[0]):
+                    c2 = oneq_circuits[ node_perms[i,j] ]
+                    circs_to_tensor.append( c2 )
+                    tensored_lines.append((unused_qubits[i],))
+                c_ten = batch_tensor(circs_to_tensor, layer_mappers, global_line_order, tensored_lines)
+                for i in range(c_ten.num_layers):
+                    l0 = set(c_ten.layer(i))
+                    l1 = set(c_ten.layer_with_idles(i))
+                    assert l0 == l1
+                circuit_lists[L].append(c_ten)
+                aux_info[c_ten] = {'edges': edge_set, 'vertices': unused_qubits} #YOLO
+
+    return circuit_lists, aux_info
+
+
+class CrosstalkFreeExperimentDesign(CircuitListsDesign):
+    """
+    This class initializes a crosstalk-free GST experiment design by combining 
+    1Q and 2Q GST designs based on a specified edge coloring. It assumes that 
+    the GST designs share the same germ powers (Ls) and utilizes a specified 
+    circuit stitcher to generate the final circuit lists.
+
+    Attributes:
+    processor_spec: Specification of the processor, including qubit labels and connectivity.
+    oneq_gstdesign: The design for one-qubit GST circuits.
+    twoq_gstdesign: The design for two-qubit GST circuits.
+    edge_coloring (dict): A dictionary mapping color patches to their corresponding edge sets.
+    circuit_stitcher (callable): A function to stitch circuits together (default: stitch_circuits_by_germ_power_only).
+    seed (int, optional): Seed for random number generation.
+
+    circuit_lists (list): The generated list of stitched circuits.
+    aux_info (dict): Auxiliary information mapping circuits to their corresponding edges and vertices.
+    """
+    def __init__(self, processor_spec, oneq_gstdesign, twoq_gstdesign, edge_coloring, 
+                 circuit_stitcher = stitch_circuits_by_germ_power_only, seed = None):
+        """
+        Assume that the GST designs have the same Ls.
+
+        TODO: Update the init function so that it handles different circuit stitchers better (i.e., by using stitcher_kwargs, etc.)
+        """
+        # TODO: make sure idle gates are explicit.
+        randstate = np.random.RandomState(seed)
+        self.processor_spec = processor_spec
+        self.oneq_gstdesign = oneq_gstdesign
+        self.twoq_gstdesign = twoq_gstdesign
+        self.vertices = self.processor_spec.qubit_labels
+        self.edges = self.processor_spec.compute_2Q_connectivity().edges()
+        self.neighbors = find_neighbors(self.vertices, self.edges)
+        self.deg = max([len(self.neighbors[v]) for v in self.vertices])
+        self.color_patches = edge_coloring
+        self.circuit_stitcher = circuit_stitcher
+    
+        self.circuit_lists, self.aux_info = circuit_stitcher(self.color_patches, self.vertices, 
+                                                            self.oneq_gstdesign, self.twoq_gstdesign, 
+                                                            randstate,
+        )
+        
+        CircuitListsDesign.__init__(self, self.circuit_lists, qubit_labels=self.vertices)
+
+
+"""
+Everything below is used to find an edge coloring of a graph.
+"""
+
+def order(u, v):
+    """
+    Return a tuple containing the two input values in sorted order.
+
+    This function takes two values, `u` and `v`, and returns them as a 
+    tuple in ascending order. The smaller value will be the first element 
+    of the tuple.
+
+    Parameters:
+    u: The first value to be ordered.
+    v: The second value to be ordered.
+
+    Returns:
+    tuple: A tuple containing the values `u` and `v` in sorted order.
+    """
+    return (min(u, v), max(u, v))
+
+
+def find_fan_candidates(fan: list, u: int, vertices: list, edge_colors: dict, free_colors: dict) -> list:
+    """
+    Selects candidate vertices to be added to a fan.
+
+    This function returns vertices connected to the anchor vertex `u` 
+    where the edge (u, v) is colored with a color that is free on the 
+    last vertex in the fan.
+
+    Parameters:
+    fan (list): A list of vertices representing the current fan.
+    u (int): The anchor vertex of the fan.
+    vertices (list): A list of all vertices in the graph.
+    edge_colors (dict): A dictionary mapping edges to their current colors.
+    free_colors (dict): A dictionary mapping vertices to their available colors.
+
+    Returns:
+    list: A list of candidate vertices that can be colored with free colors 
+          available to the last vertex in the fan.
+    """
+    last_vertex = fan[-1]
+    free_vertex_colors = free_colors[last_vertex]
+    return [v for v in vertices if edge_colors[(u, v)] in free_vertex_colors]
+
+
+def build_maximal_fan(u: int, v: int, vertex_neighbors: dict, 
+                      free_colors: dict, edge_colors: dict) -> list:
+    """
+    Construct a maximal fan of vertex u starting with vertex v.
+
+    A fan is a sequence of distinct neighbors of u that satisfies the following:
+    1. The edge (u, v) is uncolored.
+    2. Each subsequent edge (u, F[i+1]) is free on F[i] for 1 <= i < k.
+
+    Parameters:
+    u (int): The vertex from which the fan is built.
+    v (int): The first vertex in the fan.
+    vertex_neighbors (dict): A dictionary mapping vertices to their neighbors.
+    free_colors (dict): A dictionary mapping vertices to their available colors.
+    edge_colors (dict): A dictionary mapping edges to their current colors.
+
+    Returns:
+    list: A list representing the maximal fan of vertex u.
+    """
+    u_neighbors = copy.deepcopy(vertex_neighbors[u])
+    fan = [v]
+    u_neighbors.remove(v)
+
+    candidate_vertices = find_fan_candidates(fan, u, u_neighbors, edge_colors, free_colors)
+    while len(candidate_vertices) != 0:
+        fan.append(candidate_vertices[0])
+        u_neighbors.remove(candidate_vertices[0])
+        candidate_vertices = find_fan_candidates(fan, u, u_neighbors, edge_colors, free_colors)
+    return fan
+
+
+def find_next_path_vertex(current_vertex: int, color: int, neighbors: dict, edge_colors: dict):
+    """
+    Finds, if it exists, the next vertex in a cd_u path. It does so by finding the neighbor 
+    of the current vertex which is attached by an edge of the right color.
+
+    Parameters:
+    current_vertex (int): The last vertext added to the cd_u path.
+    color (int): The desired color of the next possible edge in the cd_u path.
+    neighbors (dict): A dictionary mapping each vertex to its neighboring vertices.
+    edge_colors (dict): A dictionary mapping edges to their curren colors.
+
+    Returns: 
+    int or None: The next vertex in the cd_u path that is connected by an edge of the specified color, 
+                 or None if no such vertex exists.
+    """
+    
+    for vertex in neighbors[current_vertex]:
+        if edge_colors[(current_vertex, vertex)] == color:
+            return vertex
+    return None
+    
+
+def find_color_path(u: int, v: int, c: int, d: int, neighbors: dict, edge_colors: dict) -> list:
+    """
+    Finds the cd_u path.
+
+    The cd_u path is a path passing through u of edges whose colors alternate between c and d.
+    Every cd_u path in the Misra & Gries algorithm starts at u with an edge of color 'd', 
+    because 'c' was chosen to be free on u. Assuming that a cd_u path exists.
+
+    Parameters:
+    u (int): The starting vertex of the path.
+    v (int): The target vertex (not used in path finding).
+    c (int): The color that is free on vertex `u`.
+    d (int): The color that is initially used for the first edge from `u`.
+    neighbors (dict): A dictionary mapping each vertex to its neighboring vertices.
+    edge_colors (dict): A dictionary mapping edges to their current colors.
+
+    Returns:
+    list: A list of tuples representing the edges in the cd_u path.
+    """
+    cdu_path = []
+    current_color = d
+    current_vertex = u
+    next_vertex = find_next_path_vertex(u, d, neighbors, edge_colors)
+    next_color = {c: d, d: c}
+
+    while next_vertex is not None:
+        cdu_path.append((current_vertex, next_vertex))
+        current_vertex = next_vertex
+        current_color = next_color[current_color]
+        next_vertex = find_next_path_vertex(current_vertex, current_color, neighbors, edge_colors)
+    return cdu_path
+
+
+def rotate_fan(fan: list, u: int, edge_colors: dict, free_colors: dict, color_patches: dict):
+    """
+    Rotate the colors in a fan of vertices connected to a specified vertex.
+
+    This function shifts the colors in the fan over by one position, updating the 
+    edge colorings, free colors for each vertex, and the associated color patches. 
+    After rotation, the edge connected to the specified vertex `u` and the first 
+    vertex in the fan receives the color of the next vertex in the fan, while the 
+    color of the last vertex in the fan is removed.
+
+    Parameters:
+    fan (list): A list of vertices representing the fan to be rotated.
+    u (int): The vertex anchoring the fan that is being rotated.
+    edge_colors (dict): A dictionary mapping edges to their current colors.
+    free_colors (dict): A dictionary mapping vertices to their available colors.
+    color_patches (dict): A dictionary mapping colors to lists of edges colored with that color.
+
+    Returns:
+    tuple: Updated dictionaries for edge_colors, free_colors, and color_patches after rotation.
+    """
+    for i in range(len(fan) - 1):
+        curr_vertex = fan[i]
+        next_vertex = fan[i+1]
+        next_color = edge_colors[(u, next_vertex)]
+        
+        edge_colors[(u, curr_vertex)] = next_color
+        edge_colors[(curr_vertex, u)] = next_color
+        edge_colors[(u, next_vertex)] = -1
+        edge_colors[(next_vertex, u)] = -1
+
+        free_colors[curr_vertex].remove(next_color)
+        free_colors[next_vertex].append(next_color)
+
+        color_patches[next_color].append(order(u, curr_vertex))
+        color_patches[next_color].remove(order(u, next_vertex))
+
+    return edge_colors, free_colors, color_patches
+
+
+def check_valid_edge_coloring(color_patches):
+    """
+    color_patches (dict): A dictionary mapping each color to a list of edges colored with that color.
+                          Unlike with edges, the items in color_patches are NOT symmetric [i.e., it only contains (v1, v2) for v1 < v2]
+    """
+    for c, patch in color_patches.items():
+        in_patch = set()
+        for pair in patch:
+            in_patch.add(pair[0])
+            in_patch.add(pair[1])
+        if len(in_patch) != 2*len(patch):
+            raise ValueError()
+    return
+
+
+def find_edge_coloring(deg: int, vertices: list, edges: list, neighbors: dict) -> dict:
+    """
+    Implements Misra & Gries' edge coloring algorithm for a simple undirected graph.
+
+    This function colors the edges of a simple undirected graph using at most 
+    d or d+1 colors, where d is the maximum degree of the graph. The algorithm 
+    is optimal (or off by 1) for all simple, undirected graphs, as stated by 
+    Vizing's theorem.
+
+    Parameters:
+    deg (int): The maximum degree of the graph.
+    vertices (list): A list of vertices in the graph.
+    edges (list): A list of edges represented as tuples of vertices [assumed to be symmetric, i.e., (u,v) and (v,u) are elements].
+    neighbors (dict): A dictionary mapping each vertex to its neighboring vertices.
+
+    Returns:
+    color_patches (dict): A dictionary mapping each color to a list of edges colored with that color.
+                          Unlike with edges, the items in color_patches are NOT symmetric [i.e., it only contains (v1, v2) for v1 < v2]
+    """
+
+    edges = copy.deepcopy(edges)
+    free_colors = {u: [i for i in range(deg+1)] for u in vertices} # Keeps track of which colors are free on each vertex
+    color_patches = {i: [] for i in range(deg+1)} # Keeps track of the edges (items) that have been assigned a color (keys)
+    
+    edge_colors = {edge: -1 for edge in edges} # Keeps track of which color (item) an edge has been assigned to (key)
+    edges = [list(edge) for edge in edges]
+
+    for edge in edges:
+        edge.sort()
+    edges = list(set([tuple(edge) for edge in edges]))
+
+    # Loop the edges in G.
+    # You will color a new edge each time.
+    for edge in edges:
+        # Find a maximal fan F of vertex 'u' with F[1] = 'v.'
+        u, v = edge
+        max_fan = build_maximal_fan(u, v, neighbors, free_colors, edge_colors)
+        
+        # Pick free colors c and d on u and k, the last vertex in the fan.
+        # Find the cd_u path, i.e., the maximal path through u of edges whose colors alternate between c and d.
+        k = max_fan[-1]
+        c, d = free_colors[u][-1], free_colors[k][-1] # c is free on u, while d is free on the last entry in the fan
+        cdu_path = find_color_path(u, k, c, d, neighbors, edge_colors)        
+        
+        # invert the cd_u path
+        for i in range(len(cdu_path)):
+            path_edge = cdu_path[i]
+            # path should be colored as d, c, d, c, etc... because c was free on u
+            current_color = [d, c][i%2]
+            other_color = [d, c][(i+1)%2]
+            if order(path_edge[0], path_edge[1]) in color_patches[current_color]:
+                color_patches[current_color].remove(order(path_edge[0], path_edge[1]))
+                #color_patches[current_color].remove((path_edge[1], path_edge[0]))
+            color_patches[other_color].append(order(path_edge[0], path_edge[1]))
+            #color_patches[other_color].append((path_edge[1], path_edge[0]))
+            edge_colors[path_edge] = other_color
+            edge_colors[(path_edge[1], path_edge[0])] = other_color
+        if len(cdu_path) > 0: 
+            free_colors[u].remove(c)
+            free_colors[u].append(d)
+            final_color, final_vertex = edge_colors[cdu_path[-1]], cdu_path[-1][-1]
+            free_colors[final_vertex].remove(final_color)
+            free_colors[final_vertex].append(list(np.setdiff1d([c, d], [final_color]))[0])        
+        
+        # Find a subfan of u, F' = F[1:w] for which the color d is free on w.
+        w_index = 0
+        for i in range(len(max_fan)):
+            if d in free_colors[max_fan[i]]: w_index = i
+        w, sub_fan = max_fan[w_index], max_fan[:w_index + 1]
+        
+        # Rotate the subfan. If it exists, then
+        # you have now colored the edge (u,v) with whatever color was on (u, F[2])
+        if len(sub_fan) > 1: # rotate the subfan
+            edge_colors, free_colors, color_patches = rotate_fan(sub_fan, u, edge_colors, free_colors, color_patches)
+
+        # Set the color of (u, w) to d.
+        edge_colors[(u, w)] = d
+        edge_colors[(w, u)] = d
+        color_patches[d].append(order(u, w))
+        if d in free_colors[u]:
+            free_colors[u].remove(d)
+        if d in free_colors[w]:
+            free_colors[w].remove(d)
+
+        check_valid_edge_coloring(color_patches)
+
+    return color_patches
+        
\ No newline at end of file
diff --git a/pygsti/report/factory.py b/pygsti/report/factory.py
index af52d5e60..8be472401 100644
--- a/pygsti/report/factory.py
+++ b/pygsti/report/factory.py
@@ -184,8 +184,7 @@ def _create_master_switchboard(ws, results_dict, confidence_level,
     Ls = None
 
     for results in results_dict.values():
-        est_labels = _add_new_estimate_labels(est_labels, results.estimates,
-                                              combine_robust)
+        est_labels = _add_new_estimate_labels(est_labels, results.estimates, combine_robust)
         loc_Ls = results.circuit_lists['final'].xs \
             if isinstance(results.circuit_lists['final'], _PlaquetteGridCircuitStructure) else [0]
         Ls = _add_new_labels(Ls, loc_Ls)
@@ -311,10 +310,8 @@ def _create_master_switchboard(ws, results_dict, confidence_level,
             else:
                 est_modvi = est
 
-            switchBd.objfn_builder[d, i] = est.parameters.get(
-                'final_objfn_builder', _objfns.ObjectiveFunctionBuilder.create_from('logl'))
-            switchBd.objfn_builder_modvi[d, i] = est_modvi.parameters.get(
-                'final_objfn_builder', _objfns.ObjectiveFunctionBuilder.create_from('logl'))
+            switchBd.objfn_builder[d, i] = est.parameters.get('final_objfn_builder', _objfns.ObjectiveFunctionBuilder.create_from('logl'))
+            switchBd.objfn_builder_modvi[d, i] = _objfns.ObjectiveFunctionBuilder.create_from('logl')
             switchBd.params[d, i] = est.parameters
             
             #add the final mdc store
diff --git a/pygsti/tools/dyadickronop.py b/pygsti/tools/dyadickronop.py
new file mode 100644
index 000000000..687d22c76
--- /dev/null
+++ b/pygsti/tools/dyadickronop.py
@@ -0,0 +1,132 @@
+"""
+Tools for working with Kronecker Products especially the Dyadic forms.
+"""
+#***************************************************************************************************
+# Copyright 2015, 2019, 2025 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+# Under the terms of Contract DE-NA0003525 with NTESS, the U.S. Government retains certain rights
+# in this software.
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License.  You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0 or in the LICENSE file in the root pyGSTi directory.
+#***************************************************************************************************
+
+
+from __future__ import annotations
+import numpy as np
+import scipy.sparse.linalg as sparla
+
+class RealLinOp:
+    
+    # Function implementations below are merely defaults.
+    # Don't hesitate to override them if need be.
+
+    __array_priority__ = 100
+
+    @property
+    def ndim(self):
+        return 2
+
+    @property
+    def size(self):
+        return self._size
+
+    @property
+    def shape(self):
+        return self._shape
+
+    @property
+    def dtype(self):
+        return self._dtype
+
+    @property
+    def T(self):
+        return self._adjoint
+
+    def item(self):
+        # If self.size == 1, return a scalar representation of this linear operator.
+        # Otherwise, error.
+        raise NotImplementedError()
+
+    def __matmul__(self, other):
+        return self._linop @ other
+    
+    def __rmatmul__(self, other):
+        return other @ self._linop
+
+
+class DyadicKronStructed(RealLinOp):
+
+    def __init__(self, A, B, adjoint=None):
+        assert A.ndim == 2
+        assert B.ndim == 2
+        self.A = A
+        self.B = B
+        self._A_is_trivial = A.size == 1
+        self._B_is_trivial = B.size == 1
+        self._shape = ( A.shape[0]*B.shape[0], A.shape[1]*B.shape[1] )
+        self._size = self.shape[0] * self.shape[1]
+        self._fwd_matvec_core_shape = (B.shape[1], A.shape[1])
+        self._adj_matvec_core_shape = (B.shape[0], A.shape[0])
+        self._dtype = A.dtype
+        self._linop =  sparla.LinearOperator(dtype=self.dtype, shape=self.shape, matvec=self.matvec, rmatvec=self.rmatvec)
+        self._adjoint = DyadicKronStructed(A.T, B.T, adjoint=self) if adjoint is None else adjoint
+
+    def item(self):
+        # This will raise a ValueError if self.size > 1.
+        return self.A.item() * self.B.item()
+    
+    def matvec(self, other):
+        inshape = other.shape
+        assert other.size == self.shape[1]
+        if self._A_is_trivial:
+            return self.A.item() * (self.B @ other)
+        if self._B_is_trivial:
+            return self.B.item() * (self.A @ other)
+        out = self.B @ np.reshape(other, self._fwd_matvec_core_shape, order='F') @ self.A.T
+        out = np.reshape(out, inshape, order='F')
+        return out
+
+    def rmatvec(self, other):
+        inshape = other.shape
+        assert other.size == self.shape[0]
+        if self._A_is_trivial:
+            return self.A.item() * (self.B.T @ other)
+        if self._B_is_trivial:
+            return self.B.item() * (self.A.T @ other)
+        out = self.B.T @ np.reshape(other, self._adj_matvec_core_shape, order='F') @ self.A
+        out = np.reshape(out, inshape, order='F')
+        return out
+    
+    @staticmethod
+    def build_polyadic(kron_operands) -> DyadicKronStructed:
+        if len(kron_operands) == 2:
+            out = DyadicKronStructed(kron_operands[0], kron_operands[1])
+            return out
+        # else, recurse
+        arg = DyadicKronStructed.build_polyadic(kron_operands[1:])
+        out = DyadicKronStructed(kron_operands[0], arg)
+        return out
+
+
+class KronStructured(RealLinOp):
+
+    def __init__(self, kron_operands):
+        self.kron_operands = kron_operands
+        # assert all([op.ndim == 2 for op in kron_operands])
+        self.shapes = np.array([op.shape for op in kron_operands])
+        self._shape = tuple(int(i) for i in np.prod(self.shapes, axis=0))
+        self.dyadic_struct = DyadicKronStructed.build_polyadic(self.kron_operands)
+        self._linop   = self.dyadic_struct._linop
+        self._adjoint = self.dyadic_struct.T
+        self._dtype = self.kron_operands[0].dtype
+
+    def to_full_array(self) -> np.ndarray:
+        """
+        Return the full dense matrix. Do not use this method in a performance sensitive routine
+        as you will not be utilizing the structure of the matrix to its full
+        potential. This is mainly used as a debugging tool.
+        """
+        output = 1
+        for i in range(len(self.kron_operands)):
+            output = np.kron(output, self.kron_operands[i])
+        return output
diff --git a/pygsti/tools/matrixtools.py b/pygsti/tools/matrixtools.py
index c6fbc29c9..e403c4c81 100644
--- a/pygsti/tools/matrixtools.py
+++ b/pygsti/tools/matrixtools.py
@@ -894,8 +894,9 @@ def real_matrix_log(m, action_if_imaginary="raise", tol=1e-8):
             pass
         else:
             assert(False), "Invalid 'action_if_imaginary' argument: %s" % action_if_imaginary
-    else:
-        assert(imMag <= tol), "real_matrix_log failed to construct a real logarithm!"
+    elif imMag <= tol:
+        import warnings
+        warnings.warn("real_matrix_log failed to construct a real logarithm!")
         logM = _np.real(logM)
 
     return logM
@@ -1277,11 +1278,18 @@ def _fas(a, inds, rhs, add=False):
     # index-list index is fine too.  The case we need to
     # deal with is indexing a multi-dimensional array with
     # one or more index-lists
-    if all([isinstance(i, (int, slice)) for i in inds]) or len(inds) == 1:
+    if len(inds) == 1:
         if add:
-            a[inds] += rhs  # all integers or slices behave nicely
+            a[inds] += rhs
         else:
-            a[inds] = rhs  # all integers or slices behave nicely
+            a[inds] = rhs
+    elif all([isinstance(i, (int, slice)) for i in inds]):
+        assert len(inds) == rhs.shape[1]
+        for ind, rhs_vec in zip(inds, rhs.T):
+            if add:
+                a[ind] += rhs_vec  # all integers or slices behave nicely
+            else:
+                a[ind]  = rhs_vec  # all integers or slices behave nicely
     else:
         #convert each dimension's index to a list, take a product of
         # these lists, and flatten the right hand side to get the
diff --git a/pygsti/tools/optools.py b/pygsti/tools/optools.py
index a4f2feec1..0355fd7da 100644
--- a/pygsti/tools/optools.py
+++ b/pygsti/tools/optools.py
@@ -27,7 +27,6 @@
 from pygsti.baseobjs.label import Label as _Label
 from pygsti.baseobjs.errorgenlabel import LocalElementaryErrorgenLabel as _LocalElementaryErrorgenLabel
 from pygsti.tools.legacytools import deprecate as _deprecated_fn
-from pygsti import SpaceT
 
 IMAG_TOL = 1e-7  # tolerance for imaginary part being considered zero
 
diff --git a/pygsti/tools/sequencetools.py b/pygsti/tools/sequencetools.py
new file mode 100644
index 000000000..1fc6950dc
--- /dev/null
+++ b/pygsti/tools/sequencetools.py
@@ -0,0 +1,362 @@
+"""
+Tools for finding and using the longest common substrings in order to cache and evaluation order.
+"""
+#***************************************************************************************************
+# Copyright 2015, 2019, 2025 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
+# Under the terms of Contract DE-NA0003525 with NTESS, the U.S. Government retains certain rights
+# in this software.
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License.  You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0 or in the LICENSE file in the root pyGSTi directory.
+#***************************************************************************************************
+
+from typing import Sequence, Any, List, Tuple, MutableSequence, Optional
+import numpy as _np
+from tqdm import tqdm
+
+
+def len_lcp(A: Sequence, B: Sequence) -> int:
+    """
+    Returns:
+    -----
+    int - the length of the longest matching prefix between A and B.
+    """
+    i = 0
+    n = len(A)
+    m = len(B)
+    while i < n and i < m:
+        if A[i] != B[i]:
+            return i
+        i += 1
+    return i
+
+
+def _lcs_dp_version(A: Sequence, B: Sequence) -> _np.ndarray:
+    """
+    Compute the longest common substring between A and B using
+    dynamic programming.
+
+    This will use O(n \times m) space and take O(n \times m \times max(m, n)) time.
+    """
+    table = _np.zeros((len(A) + 1, len(B) + 1))
+    n, m = table.shape
+    for i in range(n-2, -1, -1):
+        for j in range(m-2, -1, -1):
+            opt1 = 0
+            if A[i] == B[j]:
+                opt1 = len_lcp(A[i:], B[j:])
+            opt2 = table[i, j+1]
+            opt3 = table[i+1, j]
+            table[i,j] = max(opt1, opt2, opt3)
+    return table
+
+
+def conduct_one_round_of_lcs_simplification(sequences: MutableSequence[MutableSequence[Any]],
+                                            table_data_and_sequences: tuple[_np.ndarray, dict[tuple[int, int], Sequence[Any]]],
+                                            internal_tables_and_sequences: tuple[_np.ndarray, dict[tuple[int, int], Sequence[Any]]],
+                                            starting_cache_num: int,
+                                            cache_struct: dict[int, Any],
+                                            sequence_ind_to_cache_ind: Optional[dict[int, int]] = None):
+    """
+    Simplify the set of sequences by contracting the set of longest common subsequences.
+
+    Will update the list of sequences and the cache struct to hold the longest common subsequences as new sequences.
+    """
+    if not sequence_ind_to_cache_ind:
+        sequence_ind_to_cache_ind = {i: i for i in range(len(sequences))}
+    if table_data_and_sequences:
+        table, external_sequences = table_data_and_sequences
+    else:
+        table_cache = _np.zeros((len(sequences), len(sequences)))
+        table, external_sequences = _compute_lcs_for_every_pair_of_sequences(sequences, table_cache,
+                                                None, set(_np.arange(len(sequences))), "Unknown")
+
+    if internal_tables_and_sequences:
+        internal_subtable, internal_subsequences = internal_tables_and_sequences
+    else:
+        internal_subtable, internal_subsequences = create_tables_for_internal_LCS(sequences)
+
+    best_index = _np.where(table == _np.max(table))
+    best_internal_index = _np.where(internal_subtable == _np.max(internal_subtable))
+    updated_sequences = [seq for seq in sequences]
+    cache_num = starting_cache_num
+
+    # Build sequence dict
+    all_subsequences_to_replace: dict[tuple, dict[int, List[int]]] = {}
+
+    if _np.max(internal_subtable) >= _np.max(table):
+        # We are only going to replace if this was the longest substring.
+        for cir_ind in best_internal_index[0]:
+            for seq in internal_subsequences[cir_ind]:
+                key = tuple(seq)
+                if key in all_subsequences_to_replace:
+                    all_subsequences_to_replace[key][cir_ind] = internal_subsequences[cir_ind][seq]
+                else:
+                    all_subsequences_to_replace[key] = {cir_ind: internal_subsequences[cir_ind][seq]}
+
+    if _np.max(table) >= _np.max(internal_subtable):
+        for ii in range(len(best_index[0])):
+            cir_index = best_index[0][ii]
+            cir_index2 = best_index[1][ii]
+            starting_point, starting_point_2, length = external_sequences[(cir_index, cir_index2)]
+            seq = updated_sequences[cir_index][starting_point: int(starting_point + length)]
+
+            key = tuple(seq)
+            if key in all_subsequences_to_replace:
+                if cir_index not in all_subsequences_to_replace[key]:
+                    # We did not already handle this with internal subsequences.
+                    all_subsequences_to_replace[key][cir_index] = [starting_point]
+                if cir_index2 not in all_subsequences_to_replace[key]:
+                    all_subsequences_to_replace[key][cir_index2] = [starting_point_2]
+
+            else:
+                all_subsequences_to_replace[key] = {cir_index: [starting_point], cir_index2: [starting_point_2]}
+
+
+    # Handle the updates.
+    old_cache_num = cache_num
+    dirty_inds = set()
+    for seq, cdict in all_subsequences_to_replace.items():
+        w = len(seq)
+        update_made = 0
+        if  w > 1 or (not isinstance(seq[0], int)):
+            # We have reached an item which we can just compute.
+            for cir_ind in cdict:
+                my_cir = updated_sequences[cir_ind]
+                sp = 0
+                while sp+w <= len(my_cir):
+                    if list(my_cir[sp: sp+w]) == list(seq):
+                        my_cir[sp: sp + w] = [cache_num]
+                        dirty_inds.add(cir_ind)
+                        update_made = 1
+
+                    sp += 1
+                updated_sequences[cir_ind] = my_cir
+
+                cache_struct[sequence_ind_to_cache_ind[cir_ind]] = updated_sequences[cir_ind]
+
+            if update_made:
+                # There may have been multiple overlapping subsequences in the same sequence.
+                # (e.g. QWEQWEQWERQWE has QWE, WEQ, and EQW all happen and all are length 3 subsequences.)
+                updated_sequences.append(list(seq))
+                cache_struct[cache_num] = updated_sequences[cache_num]
+
+                # This is a new sequence index which will need to be updated.
+                dirty_inds.add(cache_num)
+                sequence_ind_to_cache_ind[cache_num] = cache_num
+                cache_num += 1
+
+    assert cache_num >= old_cache_num
+    assert old_cache_num >=0
+    sequences_introduced_in_this_round = _np.arange(cache_num - old_cache_num) + old_cache_num
+
+
+    return updated_sequences, cache_num, cache_struct, sequences_introduced_in_this_round, table, external_sequences, dirty_inds
+
+def simplify_internal_first_one_round(sequences: MutableSequence[MutableSequence[Any]],
+            internal_tables_and_sequences: tuple[_np.ndarray, dict[tuple[int, int], Sequence[Any]]],
+            starting_cache_num: int,
+            cache_struct: dict[int, Any],
+            seq_ind_to_cache_ind: Optional[dict[int, int]]):
+    """
+    Simplify the set of sequences by contracting the set of longest common subsequences internal subsequences.
+
+    e.g. ["AAAA"] will be replaced with cache_num cache_num. But ["BAR", "BAC"] will not update here because "BA" is split between 2 sequences.
+
+    Will update the list of sequences and the cache struct to hold the longest common subsequences as new sequences.
+    
+    Cache number will decrement so ensure that cache_struct can handle positives and negatives.
+    """
+    if not seq_ind_to_cache_ind:
+        seq_ind_to_cache_ind = {i: i for i in range(len(sequences))}
+
+    if internal_tables_and_sequences:
+        internal_subtable, internal_subsequences = internal_tables_and_sequences
+    else:
+        internal_subtable, internal_subsequences = create_tables_for_internal_LCS(sequences)
+
+    best_internal_index = _np.where(internal_subtable == _np.max(internal_subtable))
+    updated_sequences = [seq for seq in sequences]
+    cache_num = starting_cache_num
+
+    # Build sequence dict
+    all_subsequences_to_replace: dict[tuple, dict[int, List[int]]] = {}
+
+    # We are only going to replace if this was the longest substring.
+    for cir_ind in best_internal_index[0]:
+        for seq in internal_subsequences[cir_ind]:
+            key = tuple(seq)
+            if key in all_subsequences_to_replace:
+                all_subsequences_to_replace[key][cir_ind] = internal_subsequences[cir_ind][seq]
+            else:
+                all_subsequences_to_replace[key] = {cir_ind: internal_subsequences[cir_ind][seq]}
+
+    # Handle the updates.
+    old_cache_num = cache_num
+    for seq, cdict in all_subsequences_to_replace.items():
+        w = len(seq)
+        update_made = 0
+        if  w > 1 or (not isinstance(seq[0], int)):
+            # We have reached an item which we can just compute.
+            for cir_ind in cdict:
+                my_cir = updated_sequences[cir_ind]
+                sp = 0
+                while sp+w <= len(my_cir):
+                    if list(my_cir[sp: sp+w]) == list(seq):
+                        my_cir[sp: sp + w] = [cache_num]
+                        update_made = 1
+
+                    sp += 1
+                updated_sequences[cir_ind] = my_cir
+
+                cache_struct[seq_ind_to_cache_ind[cir_ind]] = updated_sequences[cir_ind]
+
+            if update_made:
+                # There may have been multiple overlapping subsequences in the same sequence.
+                # (e.g. QWEQWEQWERQWE has QWE, WEQ, and EQW all happen and all are length 3 subsequences.)
+                updated_sequences.append(list(seq))
+                cache_struct[cache_num] = list(seq) # Add the new sequence to the cache.
+
+                # Add a new mapping from sequences to cache index.
+                seq_ind_to_cache_ind[len(updated_sequences)-1] = cache_num
+
+                cache_num += -1
+
+    # Cache num and old_cache_num < 0
+    assert cache_num < 0
+    assert old_cache_num < 0
+    assert old_cache_num > cache_num
+    sequences_introduced_in_this_round = _np.arange(_np.abs(cache_num - old_cache_num))*-1 + old_cache_num
+
+    return updated_sequences, cache_num, cache_struct, sequences_introduced_in_this_round
+
+
+
+def _find_starting_positions_using_dp_table(
+        dp_table: _np.ndarray
+    ) -> tuple[int, int, int] | Tuple[None, None, None]:
+    """
+    Finds the starting positions for the longest common subsequence.
+
+    Returns:
+    ---------
+    int - starting index in A of LCS(A,B)
+    int - starting index in B of LCS(A,B)
+    int - length of LCS(A,B)
+    """
+    n, m = dp_table.shape
+    i = 0
+    j = 0
+    while i < n-1 and j < m -1:
+        curr = dp_table[i,j]
+        opt1 = dp_table[i+1, j+1] # Use
+        opt2 = dp_table[i+1, j] # Eliminate A prefix
+        opt3 = dp_table[i, j+1] # Eliminate B prefix
+        options = [opt1, opt2, opt3]
+        if _np.all(curr == options):
+            i += 1
+            j += 1
+        elif opt2 > opt1 and opt2 > opt3:
+            i += 1
+        elif opt3 > opt2 and opt3 > opt1:
+            j += 1
+        else:
+            # All three options are equal. So we should march the diagonal.
+            return i, j, dp_table[0,0]
+    return None, None, None
+
+def _lookup_in_sequence_cache(seq_cache: dict[tuple[int, int], tuple], i: int, j: int) -> tuple:
+
+    if seq_cache:
+        return seq_cache[(i, j)]
+    return (None, None, None)
+
+
+def _compute_lcs_for_every_pair_of_sequences(sequences: MutableSequence[Any],
+                                             table_cache: _np.ndarray,
+                                             seq_cache: dict,
+                                             dirty_inds: set,
+                                             expected_best: int):
+    """
+    Computes the LCS for every pair of sequences A,B in sequences
+    """
+    best_subsequences = {}
+    best_lengths = _np.zeros((len(sequences), len(sequences)))
+    curr_best = 2  # We want only subsequences that have at least two characters matching.
+    for i in tqdm(range(len(sequences)-1, -1, -1),
+                  f"LCS_circuits Expected Val {expected_best}: ", disable = True): # Lets do this in reverse order
+        cir0 = sequences[i]
+        for j in range(i-1, -1, -1):
+            cir1 = sequences[j]
+            if i in dirty_inds or j in dirty_inds:
+                if len(cir0) < curr_best or len(cir1) < curr_best:
+                    # Mark pair as dirty to be computed later when it may be the longest subsequence.
+                    best_lengths[i,j] = -1
+                    best_subsequences[(i,j)] = (None, None, None)
+                else:
+                    table = _lcs_dp_version(cir0, cir1)
+                    best_lengths[i,j] = table[0,0]
+                    best_subsequences[(i,j)] = _find_starting_positions_using_dp_table(table)
+                    curr_best = max(best_lengths[i,j], curr_best)
+            else:
+                best_lengths[i,j] = table_cache[i,j]
+                best_subsequences[(i,j)] = _lookup_in_sequence_cache(seq_cache, i, j)
+
+    return best_lengths, best_subsequences
+
+
+def _longest_common_internal_subsequence(A: Sequence) -> tuple[int, dict[tuple, set[int]]]:
+    """
+    Compute the longest common subsequence within a single circuit A.
+
+    Cost ~ O(L^3 / 8) where L is the length of A
+
+    Returns:
+    ---------
+    int - length of longest common subsequences within A
+    dict[tuple, set[int]] - dictionary of subsequences to starting positions within A.
+    """
+    n = len(A)
+    best = 0
+    best_ind : dict[tuple[Any,...], set[int]] = dict()
+    changed = False
+    for w in range(1, int(_np.floor(n / 2) + 1)):
+        for sp in range(n - w):
+            window = A[sp: sp + w]
+            for match in range(sp+ w, n-w + 1):
+                if A[match: match + w] == window:
+                    if best == w:
+                        if tuple(window) in best_ind:
+                            best_ind[tuple(window)].add(match)
+                        else:
+                            best_ind[tuple(window)] = {sp, match}
+                    else:
+                        best_ind = {tuple(window): {sp, match}}
+                        changed = True
+                        best = w
+        if not changed:
+            return best, best_ind
+    return best, best_ind
+
+
+def create_tables_for_internal_LCS(
+        sequences: Sequence[Sequence[Any]]
+    ) -> tuple[
+        _np.ndarray, List[dict[tuple[Any,...], set[int]]]
+    ]:
+    """
+    Compute all the longest common internal sequences for each circuit A in sequences
+
+    Total cost is O(C L^3).
+    """
+
+    C = len(sequences)
+    the_table = _np.zeros(C)
+    seq_table : List[dict[tuple[Any,...], set[int]]] = [dict() for _ in range(C)]
+
+    curr_best = 1
+    for i in range(C):
+        if len(sequences[i]) >= 2*curr_best:
+            the_table[i], seq_table[i] = _longest_common_internal_subsequence(sequences[i])
+            curr_best = max(curr_best, the_table[i])
+    return the_table, seq_table
diff --git a/pygsti/tools/tqdm.py b/pygsti/tools/tqdm.py
new file mode 100644
index 000000000..f3d66fe8d
--- /dev/null
+++ b/pygsti/tools/tqdm.py
@@ -0,0 +1,7 @@
+try:
+    from tqdm import tqdm
+    def our_tqdm(iterator, message):
+        return tqdm(iterator, message)
+except ImportError:
+    def our_tqdm(iterator, ignore):
+        return iterator
diff --git a/test/unit/mpi/run_me_with_mpiexec.py b/test/unit/mpi/run_me_with_mpiexec.py
index 04a781d28..f2f7620a7 100644
--- a/test/unit/mpi/run_me_with_mpiexec.py
+++ b/test/unit/mpi/run_me_with_mpiexec.py
@@ -1,3 +1,4 @@
+
 # This file is designed to be run via: mpiexec -np 4 python -W ignore run_me_with_mpiexec.py
 # This does not use nosetests because I want to set verbosity differently based on rank (quiet if not rank 0)
 # By wrapping asserts in comm.rank == 0, only rank 0 should fail (should help with output)
@@ -11,7 +12,9 @@
 import pygsti
 from pygsti.modelpacks import smq1Q_XYI as std
 
+pygsti.optimize.customsolve.CUSTOM_SOLVE_THRESHOLD = 10
 wcomm = MPI.COMM_WORLD
+print(f'Running with CUSTOM_SOLVE_THRESHOLD = {pygsti.optimize.customsolve.CUSTOM_SOLVE_THRESHOLD}')
 
 
 class ParallelTest(object):
diff --git a/test/unit/objects/test_errorgenpropagation.py b/test/unit/objects/tempdeactivated_test_errorgenpropagation.py
similarity index 100%
rename from test/unit/objects/test_errorgenpropagation.py
rename to test/unit/objects/tempdeactivated_test_errorgenpropagation.py
diff --git a/test/unit/objects/test_circuit_splitting.py b/test/unit/objects/test_circuit_splitting.py
new file mode 100644
index 000000000..5970dc4cd
--- /dev/null
+++ b/test/unit/objects/test_circuit_splitting.py
@@ -0,0 +1,149 @@
+from pygsti.circuits.circuit import Circuit as _Circuit
+from pygsti.baseobjs.label import Label
+from pygsti.circuits.split_circuits_into_lanes import compute_subcircuits, compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit
+import numpy as np
+
+
+def build_circuit(num_qubits: int, depth_L: int, allowed_gates: set[str]) -> _Circuit:
+    """
+    Build a random circuit of depth L which operates on num_qubits and has the allowed
+    single qubit gates specified in allowed gates.
+    """
+    my_circuit = []
+    for lnum in range(depth_L):
+        layer = []
+        for qnum in range(num_qubits):
+            gate = str(np.random.choice(allowed_gates))
+            layer.append((gate, qnum))
+        my_circuit.append(layer)
+    return _Circuit(my_circuit, line_labels=[i for i in range(num_qubits)])
+
+
+def build_circuit_with_multiple_qubit_gates_with_designated_lanes(
+                                num_qubits: int,
+                                depth_L: int,
+                                lane_end_points: list[int],
+                                gates_to_qubits_used: dict[str, int]) -> _Circuit:
+    """
+    Builds a circuit with a known lane structure.
+    Any two + qubit lanes can be split into smaller lanes if none of the gates
+    chosen for that lane actually operate on two or more qubits.
+    """
+
+
+    assert lane_end_points[-1] <= num_qubits # if < then we have a lane from there to num_qubits.
+    assert lane_end_points[0] > 0
+    assert np.all(np.diff(lane_end_points) > 0) # then it is sorted in increasing order.
+
+    if lane_end_points[-1] < num_qubits:
+        lane_end_points.append(num_qubits)
+
+    my_circuit = []
+    n_qs_to_gates_avail = {}
+    for key, val in gates_to_qubits_used.items():
+        if val in n_qs_to_gates_avail:
+            n_qs_to_gates_avail[val].append(key)
+        else:
+            n_qs_to_gates_avail[val] = [key]
+
+    for lnum in range(depth_L):
+        layer = []
+        start_point = 0
+
+        for lane_ep in lane_end_points:
+            num_used: int = 0
+            while num_used < (lane_ep - start_point):
+                navail = (lane_ep - start_point) - num_used
+                nchosen = 0
+                if navail >= max(n_qs_to_gates_avail):
+                    # we can use any gate
+                    nchosen = np.random.randint(1, max(n_qs_to_gates_avail) + 1)
+                else:
+                    # we need to first choose how many to use.
+                    nchosen = np.random.randint(1, navail + 1)
+                gate = str(np.random.choice(n_qs_to_gates_avail[nchosen]))
+                tmp = list(np.random.permutation(nchosen) + num_used + start_point) # Increase to offset.
+                perm_of_qubits_used = [int(tmp[ind]) for ind in range(len(tmp))]
+                if gate == "Gcustom":
+                    layer.append(Label(gate, *perm_of_qubits_used, args=(np.random.random(4)*4*np.pi)))
+                else:
+                    layer.append((gate, *perm_of_qubits_used))
+                num_used += nchosen
+
+            if num_used > (lane_ep - start_point) + 1:
+                print(num_used, f"lane ({start_point}, {lane_ep})")
+                raise AssertionError("lane barrier is broken")
+            
+            start_point = lane_ep
+        my_circuit.append(layer)
+    return _Circuit(my_circuit, line_labels=[i for i in range(num_qubits)])
+
+
+def test_subcircuits_splits_can_create_empty_sub_circuit():
+
+
+    original = _Circuit([], line_labels=[0])
+
+    qubits_to_lanes = {0: 0}
+    lane_to_qubits = {0: (0,)}
+
+    attempt = compute_subcircuits(original, qubits_to_lanes, lane_to_qubits)
+
+    assert original == _Circuit(attempt[0], line_labels=[0])
+
+def test_subcircuits_split_can_be_cached():
+
+    gates_to_num_used = {"X": 1, "Y": 1, "Z": 1, "CNOT": 2, "CZ": 2}
+
+    depth = 10
+    num_qubits = 6
+
+    lane_eps = [1, 2, 4, 5]
+    # So expected lane dist is (0, ), (1), (2,3), (4,), (5,)
+
+    # This is a random circuit so the lanes may not be perfect.
+    circuit = build_circuit_with_multiple_qubit_gates_with_designated_lanes(num_qubits, depth, lane_eps, gates_to_num_used)
+
+    qubit_to_lane, lane_to_qubits = compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit(circuit)
+
+
+    assert "lanes" in circuit.saved_auxinfo
+
+    assert list(circuit.saved_auxinfo["lanes"].keys()) == [(0, 1, 2, 3, 4, 5)]
+
+    sub_cirs = compute_subcircuits(circuit, qubit_to_lane, lane_to_qubits, cache_lanes_in_circuit=True)
+
+    assert len(circuit.saved_auxinfo["lanes"].keys()) == len(sub_cirs)
+
+
+
+def test_find_qubit_to_lane_splitting():
+
+    gates_to_num_used = {"X": 1, "Y": 1, "Z": 1, "CNOT": 2, "CZ": 2}
+
+    depth = 10
+    num_qubits = 6
+
+    lane_eps = [1, 2, 4, 5]
+    # So expected lane dist is (0, ), (1), (2,3), (4,), (5,)
+    minimum_num_lanes = 5
+
+    # This is a random circuit so the lanes may not be perfect.
+    circuit = build_circuit_with_multiple_qubit_gates_with_designated_lanes(num_qubits, depth, lane_eps, gates_to_num_used)
+
+    qubit_to_lane, lane_to_qubits = compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit(circuit)
+
+
+    assert len(qubit_to_lane) == num_qubits
+
+    assert len(lane_to_qubits) >= minimum_num_lanes
+    assert len(lane_to_qubits) <= num_qubits
+
+    for qubit in qubit_to_lane:
+        assert qubit_to_lane[qubit] in lane_to_qubits
+
+
+    for lane in lane_to_qubits:
+        for qu in lane_to_qubits[lane]:
+            assert qu in qubit_to_lane
+            assert lane == qubit_to_lane[qu]
diff --git a/test/unit/objects/test_forwardsim.py b/test/unit/objects/test_forwardsim.py
index 5c608baee..0a8292528 100644
--- a/test/unit/objects/test_forwardsim.py
+++ b/test/unit/objects/test_forwardsim.py
@@ -9,18 +9,25 @@
     MapForwardSimulator, SimpleMapForwardSimulator, \
     MatrixForwardSimulator,  SimpleMatrixForwardSimulator, \
     TorchForwardSimulator
+from pygsti.forwardsims.matrixforwardsim import LCSEvalTreeMatrixForwardSimulator
 from pygsti.models import ExplicitOpModel
 from pygsti.circuits import Circuit, create_lsgst_circuit_lists
 from pygsti.baseobjs import Label as L
-from ..util import BaseCase
+try:
+    from ..util import BaseCase
+except ImportError:
+    BaseCase = object
 
 from pygsti.data import simulate_data
-from pygsti.modelpacks import smq1Q_XYI
+from pygsti.modelpacks import smq1Q_XYI, smq1Q_XY, smq2Q_XYZICNOT, smq2Q_XYCNOT
 from pygsti.protocols import gst
 from pygsti.protocols.protocol import ProtocolData
 from pygsti.tools import two_delta_logl
 
 
+GLOBAL_MODEL_PACK = smq1Q_XY
+
+
 def Ls(*args):
     """ Convert args to a tuple to Labels """
     return tuple([L(x) for x in args])
@@ -149,8 +156,8 @@ class BaseProtocolData:
 
     @classmethod
     def setUpClass(cls):
-        cls.gst_design = smq1Q_XYI.create_gst_experiment_design(max_max_length=16)
-        cls.mdl_target = smq1Q_XYI.target_model()
+        cls.gst_design = GLOBAL_MODEL_PACK.create_gst_experiment_design(max_max_length=16)
+        cls.mdl_target = GLOBAL_MODEL_PACK.target_model()
         cls.mdl_datagen = cls.mdl_target.depolarize(op_noise=0.05, spam_noise=0.025)
 
         ds = simulate_data(cls.mdl_datagen, cls.gst_design.all_circuits_needing_data, 20000, sample_error='none')
@@ -255,23 +262,23 @@ def jac_colinearities(self):
             colinearities *= -1
         return colinearities
 
-    
 
 class ForwardSimConsistencyTester(TestCase):
 
     PROBS_TOL = 1e-14
     JACS_TOL = 1e-10
 
+    
     def setUp(self):
-        self.model_ideal = smq1Q_XYI.target_model()
+        self.model_ideal = GLOBAL_MODEL_PACK.target_model()
         if TorchForwardSimulator.ENABLED:
             # TorchFowardSimulator can only work with TP modelmembers.
             self.model_ideal.convert_members_inplace(to_type='full TP')
         
         self.model_noisy = self.model_ideal.depolarize(op_noise=0.05, spam_noise=0.025)
-        prep_fiducials = smq1Q_XYI.prep_fiducials()
-        meas_fiducials = smq1Q_XYI.meas_fiducials()
-        germs = smq1Q_XYI.germs()
+        prep_fiducials = GLOBAL_MODEL_PACK.prep_fiducials()
+        meas_fiducials = GLOBAL_MODEL_PACK.meas_fiducials()
+        germs = GLOBAL_MODEL_PACK.germs()
         max_lengths = [4]
         circuits = create_lsgst_circuit_lists(
             self.model_noisy, prep_fiducials, meas_fiducials, germs, max_lengths
@@ -280,7 +287,8 @@ def setUp(self):
             SimpleMapForwardSimulator(),
             SimpleMatrixForwardSimulator(),
             MapForwardSimulator(),
-            MatrixForwardSimulator()
+            MatrixForwardSimulator(),
+            LCSEvalTreeMatrixForwardSimulator()
         ]
         if TorchForwardSimulator.ENABLED:
             sims.append(TorchForwardSimulator())
@@ -334,9 +342,9 @@ class ForwardSimIntegrationTester(BaseProtocolData):
 
     def _run(self, obj : ForwardSimulator.Castable):
         self.setUpClass()
-        proto = gst.GateSetTomography(smq1Q_XYI.target_model("full TP"), 'stdgaugeopt', name="testGST")
+        proto = gst.GateSetTomography(GLOBAL_MODEL_PACK.target_model("full TP"), name="testGST")
         results = proto.run(self.gst_data, simulator=obj)
-        mdl_result = results.estimates["testGST"].models['stdgaugeopt']
+        mdl_result = results.estimates["testGST"].models["final iteration estimate"]
         twoDLogL = two_delta_logl(mdl_result, self.gst_data.dataset)
         assert twoDLogL <= 0.05  # should be near 0 for perfect data
         pass
@@ -359,3 +367,11 @@ def test_map_fwdsim(self):
     def test_matrix_fwdsim(self):
         self._run(MatrixForwardSimulator)
 
+    def test_lcs_matrix_fwdsim(self):
+        self._run(LCSEvalTreeMatrixForwardSimulator)
+
+
+if __name__ == '__main__':
+    tester = ForwardSimConsistencyTester()
+    tester.test_consistent_probs()
+    print()
diff --git a/test/unit/objects/test_forwardsim_on_implicitop_model.py b/test/unit/objects/test_forwardsim_on_implicitop_model.py
new file mode 100644
index 000000000..9b811c696
--- /dev/null
+++ b/test/unit/objects/test_forwardsim_on_implicitop_model.py
@@ -0,0 +1,714 @@
+import numpy as np
+from tqdm import tqdm
+
+
+from pygsti.baseobjs import qubitgraph as _qgraph
+from pygsti.baseobjs import QubitSpace
+from pygsti.models import modelconstruction as pgmc
+from pygsti.processors import QubitProcessorSpec
+from pygsti.modelmembers.states import ComposedState, ComputationalBasisState
+from pygsti.modelmembers.povms import ComposedPOVM
+from pygsti.modelmembers.operations import LindbladErrorgen, ExpErrorgenOp
+from pygsti.baseobjs.errorgenbasis import CompleteElementaryErrorgenBasis
+from pygsti.circuits import Circuit
+from pygsti.algorithms import BuiltinBasis
+from pygsti.tools import unitary_to_superop
+from pygsti.baseobjs import Label
+from pygsti.modelmembers import operations as op
+from pygsti.baseobjs import UnitaryGateFunction
+from pygsti.forwardsims.matrixforwardsim import LCSEvalTreeMatrixForwardSimulator
+from pygsti.forwardsims import MapForwardSimulator, MatrixForwardSimulator
+
+
+def assert_probability_densities_are_equal(op_dict: dict, exp_dict: dict, cir: Circuit):
+
+    for key, val in op_dict.items():
+        assert key in exp_dict
+        assert np.allclose(val, exp_dict[key]), f"Circuit {cir}, Outcome {key}, Expected: {exp_dict[key]}, Got: {val}"
+        # try:
+        # except:
+            # breakpoint()
+            # raise AssertionError()
+
+def convert_dict_of_dist_to_array(my_dictionary: dict) -> np.ndarray:
+
+    out = [[] for _ in my_dictionary.keys()]
+
+    for i, key in enumerate(sorted(my_dictionary.keys())):
+        val = my_dictionary[key]
+        if isinstance(val, dict):
+            val = convert_dict_of_dist_to_array(val)
+        out[i] = val
+
+    return np.array(out)
+
+#region Model Construction
+def construct_arbitrary_single_qubit_unitary(alpha, beta, gamma, delta):
+
+    first_term = np.exp(alpha * 1j)
+    left_mat = np.array([np.exp(-1j * beta / 2), 0, 0, np.exp(1j * beta / 2)]).reshape(2, 2)
+    rotate_mat = np.array([np.cos(gamma / 2), -np.sin(gamma / 2), np.sin(gamma / 2), np.cos(gamma / 2)]).reshape(2, 2)
+    right_mat = np.array([np.exp(-1j * delta / 2), 0, 0, np.exp(1j * delta / 2)]).reshape(2, 2)
+
+    my_matrix = first_term * (left_mat @ (rotate_mat @ right_mat))
+
+    assert np.allclose(np.conjugate(my_matrix.T) @ my_matrix, np.eye(2))
+
+    return my_matrix
+
+
+class MyContinuouslyParameterizedGateFunction(UnitaryGateFunction):
+    shape = (2, 2)
+
+    def __call__(self, alpha, beta, gamma, delta):
+        return construct_arbitrary_single_qubit_unitary(alpha, beta, gamma, delta)
+
+
+class ArbParameterizedOpFactory(op.OpFactory):
+    def __init__(self, state_space, location: int):
+        op.OpFactory.__init__(self, state_space=state_space, evotype="densitymx")
+        self.my_state_space = state_space
+        self.interesting_qubit = location
+
+    def create_object(self, args=None, sslbls=None):
+        assert (len(args) == 4)
+        alpha, beta, gamma, delta = args
+
+        unitary = construct_arbitrary_single_qubit_unitary(alpha, beta, gamma, delta)
+
+        superop = unitary_to_superop(unitary)
+        return op.EmbeddedOp(state_space=self.my_state_space,
+                             target_labels=self.interesting_qubit,
+                             operation_to_embed=op.StaticArbitraryOp(superop))
+
+
+def make_spam(num_qubits):
+    state_space = QubitSpace(num_qubits)
+    max_weights = {'H': 1, 'S': 1, 'C': 1, 'A': 1}
+    egbn_H_only = CompleteElementaryErrorgenBasis(BuiltinBasis("PP", 4), state_space, ('H', ), max_weights)
+
+    rho_errgen_rates = {ell: 0.0 for ell in egbn_H_only.labels}
+    rho_lindblad = LindbladErrorgen.from_elementary_errorgens(rho_errgen_rates,
+                                                              parameterization='H',
+                                                              state_space=state_space,
+                                                              evotype='densitymx')
+    rho_errorgen = ExpErrorgenOp(rho_lindblad)
+    rho_ideal = ComputationalBasisState([0] * num_qubits)
+    rho = ComposedState(rho_ideal, rho_errorgen)
+
+    M_errgen_rates = {ell: 0.0 for ell in egbn_H_only.labels}
+    M_lindblad = LindbladErrorgen.from_elementary_errorgens(M_errgen_rates,
+                                                            parameterization='H',
+                                                            state_space=state_space,
+                                                            evotype='densitymx')
+    M_errorgen = ExpErrorgenOp(M_lindblad)
+    M = ComposedPOVM(M_errorgen)
+
+    return rho, M
+
+
+def make_target_model(num_qubits,
+                      independent_gates: bool = True,
+                      arbitrary_unit: bool = False,
+                      simplify_for_dprobs: bool = True):
+
+    ps_geometry = _qgraph.QubitGraph.common_graph(
+        num_qubits, geometry='line',
+        directed=True, all_directions=True,
+        qubit_labels=tuple(range(num_qubits))
+    )
+    u_ecr = 1 / np.sqrt(2) * np.array([[0, 0, 1, 1j],
+                                       [0, 0, 1j, 1],
+                                       [1, -1j, 0, 0],
+                                       [-1j, 1, 0, 0]])
+    gatenames = ['Gxpi2', 'Gypi2', 'Gzpi2', 'Gi', 'Gii', 'Gecr', "Gcnot", "Gswap"]
+    if simplify_for_dprobs:
+        gatenames = ['Gxpi2', 'Gypi2', 'Gzpi2', 'Gi', 'Gii', 'Gecr']
+
+    ps = QubitProcessorSpec(
+        num_qubits=num_qubits,
+        gate_names=gatenames,
+        nonstd_gate_unitaries={'Gecr': u_ecr, 'Gii': np.eye(4)},
+        geometry=ps_geometry
+    )
+    gateerrs = dict()
+    basis = BuiltinBasis("PP", QubitSpace(1))
+    egb1 = CompleteElementaryErrorgenBasis(basis, QubitSpace(1), ('H', 'S'))
+    for gn in gatenames[:-1]:
+        gateerrs[gn] = {ell: 0 for ell in egb1.labels}
+    egb2 = CompleteElementaryErrorgenBasis(basis, QubitSpace(2), ('H', 'S'))
+    gateerrs['Gecr'] = {ell: 0 for ell in egb2.labels}
+    gateerrs['Gii'] = gateerrs['Gecr']
+
+    if not simplify_for_dprobs:
+        gateerrs["Gswap"] = gateerrs["Gecr"]
+        gateerrs["Gcnot"] = gateerrs["Gecr"]
+
+    tmn = pgmc.create_crosstalk_free_model(ps, lindblad_error_coeffs=gateerrs, independent_gates=independent_gates)
+
+    rho, M = make_spam(num_qubits)
+    tmn.prep_blks['layers']['rho0'] = rho
+    tmn.povm_blks['layers']['Mdefault'] = M
+    tmn._rebuild_paramvec()
+
+    # tmn._layer_rules.implicit_idle_mode = "pad_1Q"
+
+    if arbitrary_unit:
+        for i in range(num_qubits):
+            Ga_factory = ArbParameterizedOpFactory(state_space=QubitSpace(num_qubits), location=(i, ))
+            tmn.factories["layers"][("Gcustom", i)] = Ga_factory  # add in the factory for every qubit.
+
+    return tmn
+
+
+def build_models_for_testing(num_qubits, independent_gates: bool = False, simplify_for_dprobs: bool = False):
+
+    tgt_model = make_target_model(num_qubits,
+                                  independent_gates=independent_gates,
+                                  simplify_for_dprobs=simplify_for_dprobs)
+
+    # target_model.sim.calclib = pygsti.forwardsims.mapforwardsim_calc_generic
+    tgt_model.sim = LCSEvalTreeMatrixForwardSimulator()
+
+    tgt_model2 = tgt_model.copy()
+    tgt_model2.sim = MapForwardSimulator()
+    # make_target_model(num_qubits, independent_gates=independent_gates, simplify_for_dprobs=simplify_for_dprobs)
+
+    return tgt_model, tgt_model2
+
+
+#endregion Model Construction
+
+
+#region Building Random Circuits
+
+def build_circuit(num_qubits: int, depth_L: int, allowed_gates: set[str]) -> Circuit:
+    my_circuit = []
+    for lnum in range(depth_L):
+        layer = []
+        for qnum in range(num_qubits):
+            gate = str(np.random.choice(allowed_gates))
+            layer.append((gate, qnum))
+        my_circuit.append(layer)
+    return Circuit(my_circuit)
+
+
+def build_circuit_with_arbitrarily_random_single_qubit_gates(num_qubits: int, depth_L: int) -> Circuit:
+
+    my_circuit = []
+    gate_name = "Gcustom"
+
+    full_args = np.random.random((depth_L, num_qubits, 4)) * 4 * np.pi  # Need to be in [0, 2 \pi] for the half angles.
+
+    for lnum in range(depth_L):
+        layer = []
+        for qnum in range(num_qubits):
+            gate = Label(gate_name, qnum, args=(full_args[lnum, qnum]))
+            layer.append(gate)
+        my_circuit.append(layer)
+    return Circuit(my_circuit, num_lines=num_qubits)
+
+
+def build_circuit_with_multiple_qubit_gates_with_designated_lanes(
+        num_qubits: int, depth_L: int,
+        lane_end_points: list[int], gates_to_qubits_used: dict[str, int]) -> Circuit:
+
+    assert lane_end_points[-1] <= num_qubits  # if < then we have a lane from there to num_qubits.
+    assert lane_end_points[0] > 0
+    assert np.all(np.diff(lane_end_points) > 0)  # then it is sorted in increasing order.
+
+    if lane_end_points[-1] < num_qubits:
+        lane_end_points.append(num_qubits)
+
+    my_circuit = []
+    n_qs_to_gates_avail = {}
+    for key, val in gates_to_qubits_used.items():
+        if val in n_qs_to_gates_avail:
+            n_qs_to_gates_avail[val].append(key)
+        else:
+            n_qs_to_gates_avail[val] = [key]
+
+    for lnum in range(depth_L):
+        layer = []
+        start_point = 0
+
+        for lane_ep in lane_end_points:
+            num_used: int = 0
+            while num_used < (lane_ep - start_point):
+                navail = (lane_ep - start_point) - num_used
+                nchosen = 0
+                if navail >= max(n_qs_to_gates_avail):
+                    # we can use any gate
+                    nchosen = np.random.randint(1, max(n_qs_to_gates_avail) + 1)
+                else:
+                    # we need to first choose how many to use.
+                    nchosen = np.random.randint(1, navail + 1)
+                gate = str(np.random.choice(n_qs_to_gates_avail[nchosen]))
+                tmp = list(np.random.permutation(nchosen) + num_used + start_point)  # Increase to offset.
+                perm_of_qubits_used = [int(tmp[ind]) for ind in range(len(tmp))]
+                if gate == "Gcustom":
+                    layer.append(Label(gate, *perm_of_qubits_used, args=(np.random.random(4) * 4 * np.pi)))
+                else:
+                    layer.append((gate, *perm_of_qubits_used))
+                num_used += nchosen
+
+            if num_used > (lane_ep - start_point) + 1:
+                print(num_used, f"lane ({start_point}, {lane_ep})")
+                raise AssertionError("lane barrier is broken")
+
+            start_point = lane_ep
+        my_circuit.append(layer)
+    return Circuit(my_circuit, num_lines=num_qubits)
+
+
+def build_circuit_with_multiple_qubit_gates(num_qubits: int,
+                                            depth_L: int,
+                                            gates_to_qubits_used: dict[str, int],
+                                            starting_qubit: int = 0):
+
+    my_circuit = []
+    n_qs_to_gates_avail = {}
+    for key, val in gates_to_qubits_used.items():
+        if val in n_qs_to_gates_avail:
+            n_qs_to_gates_avail[val].append(key)
+        else:
+            n_qs_to_gates_avail[val] = [key]
+
+    for lnum in range(depth_L):
+        layer = []
+        num_used: int = 0
+        while num_used < num_qubits:
+            navail = num_qubits - num_used
+            nchosen = 0
+            if navail >= max(n_qs_to_gates_avail):
+                # we can use any gate
+                nchosen = np.random.randint(1, max(n_qs_to_gates_avail) + 1)
+            else:
+                # we need to first choose how many to use.
+                nchosen = np.random.randint(1, navail + 1)
+            gate = str(np.random.choice(n_qs_to_gates_avail[nchosen]))
+            tmp = list(np.random.permutation(nchosen) + num_used)  # Increase to offset.
+            perm_of_qubits_used = [int(tmp[ind]) for ind in range(len(tmp))]
+            if gate == "Gcustom":
+                layer.append(Label(gate, * perm_of_qubits_used, args=(np.random.random(4) * 4 * np.pi)))
+            else:
+                layer.append((gate, * perm_of_qubits_used))
+            num_used += nchosen
+
+        my_circuit.append(layer)
+    return Circuit(my_circuit, num_lines=num_qubits)
+
+#endregion Building Random Circuits
+
+
+#region Consistency of Probability
+def test_tensor_product_single_unitaries_yield_right_results():
+
+    num_qubits = 4
+
+    under_test, expected_model = build_models_for_testing(num_qubits)
+
+    circuitNone = Circuit([], num_lines=num_qubits)
+    circuitX = Circuit([("Gxpi2", i) for i in range(num_qubits)], num_lines=num_qubits)
+    circuitY = Circuit([("Gypi2", i) for i in range(num_qubits)], num_lines=num_qubits)
+    circuitZ = Circuit([("Gzpi2", i) for i in range(num_qubits)], num_lines=num_qubits)
+    circuitIdle = Circuit([("Gi", i) for i in range(num_qubits)], num_lines=num_qubits)
+
+    for cir in [circuitNone, circuitX, circuitY, circuitZ, circuitIdle]:
+        probs = under_test.probabilities(cir)
+        exp = expected_model.probabilities(cir)
+
+        assert_probability_densities_are_equal(probs, exp, cir)
+
+
+def test_tensor_product_single_unitaries_random_collection_of_xyz():
+
+    for qb in range(2, 6):
+
+        under_test, expected_model = build_models_for_testing(qb)
+        allowed_gates = ['Gxpi2', 'Gypi2', "Gzpi2", 'Gi']
+
+        circuit100 = build_circuit(qb, 100, allowed_gates=allowed_gates)
+
+        probs = under_test.probabilities(circuit100)
+        exp = expected_model.probabilities(circuit100)
+
+        assert_probability_densities_are_equal(probs, exp, circuit100)
+
+
+def test_tensor_product_two_qubit_gates_bulk():
+
+    num_qubits = 4
+
+    under_test, expected_model = build_models_for_testing(num_qubits)
+
+    circuitECR01 = Circuit([[("Gecr", 0, 1), ("Gi", 2), ("Gzpi2", 3)]])
+    circuitECR10 = Circuit([[("Gecr", 1, 0), ("Gi", 2), ("Gzpi2", 3)]])
+
+    circ_list = [circuitECR01, circuitECR10]
+
+    probs = under_test.sim.bulk_probs(circ_list)
+    exp = expected_model.sim.bulk_probs(circ_list)
+
+    for cir in circ_list:
+        assert_probability_densities_are_equal(probs[cir], exp[cir], cir)
+
+
+def test_tensor_product_two_qubit_gates():
+
+    num_qubits = 4
+
+    under_test, expected_model = build_models_for_testing(num_qubits)
+
+    circuitECR01 = Circuit([[("Gecr", 0, 1), ("Gi", 2), ("Gzpi2", 3)]])
+    circuitECR10 = Circuit([[("Gecr", 1, 0), ("Gi", 2), ("Gzpi2", 3)]])
+
+    for cir in [circuitECR01, circuitECR10]:
+        probs = under_test.probabilities(cir)
+        exp = expected_model.probabilities(cir)
+
+        assert_probability_densities_are_equal(probs, exp, cir)
+
+
+def test_tensor_product_gates_with_implicit_idles():
+
+    num_qubits = 5
+
+    under_test, expected_model = build_models_for_testing(num_qubits)
+
+    gatenames = ["Gxpi2", "Gypi2", "Gzpi2", "Gi"]
+    for gate in gatenames:
+        for i in range(num_qubits):
+            cir = Circuit([[(gate, i)]], num_lines=num_qubits)
+
+            probs = under_test.probabilities(cir)
+            exp = expected_model.probabilities(cir)
+            assert_probability_densities_are_equal(probs, exp, cir)
+
+    # Now for the two qubit gates. Gecr and GCNOT
+
+    # gatenames = ["Gecr", "Gcnot"]
+    gatenames = ["Gecr"]
+    for gate in gatenames:
+        for i in range(num_qubits - 1):
+            cir = Circuit([[(gate, i, i + 1)]], num_lines=num_qubits)
+
+            probs = under_test.probabilities(cir)
+            exp = expected_model.probabilities(cir)
+            assert_probability_densities_are_equal(probs, exp, cir)
+
+            # Order swapped.
+            cir = Circuit([[(gate, i + 1, i)]], num_lines=num_qubits)
+
+            probs = under_test.probabilities(cir)
+            exp = expected_model.probabilities(cir)
+            assert_probability_densities_are_equal(probs, exp, cir)
+
+
+def test_tensor_product_multi_qubit_gates_with_structured_lanes():
+
+    gates_to_used_qubits = {'Gxpi2': 1, 'Gypi2': 1, 'Gzpi2': 1, 'Gi': 1, 'Gswap': 2, 'Gcnot': 2, 'Gecr': 2}
+    for qb in range(5, 6):
+
+        lanes = [1, 2, 4]
+
+        under_test, expected_model = build_models_for_testing(qb)
+
+        circuit = build_circuit_with_multiple_qubit_gates_with_designated_lanes(qb,
+                                                                                100,
+                                                                                lanes,
+                                                                                gates_to_used_qubits)
+
+        probs = under_test.probabilities(circuit)
+        exp = expected_model.probabilities(circuit)
+
+        assert_probability_densities_are_equal(probs, exp, circuit)
+#endregion Probabilities Consistency tests
+
+
+#region D Probabilities Consistency Tests
+
+def test_tensor_product_two_qubit_gates_dprobs():
+
+    num_qubits = 4
+
+    under_test, expected_model = build_models_for_testing(num_qubits, simplify_for_dprobs=True)
+
+    circuitECR01 = Circuit([[("Gecr", 0, 1), ("Gi", 2), ("Gzpi2", 3)]])
+    circuitECR10 = Circuit([[("Gecr", 1, 0), ("Gi", 2), ("Gzpi2", 3)]])
+
+    for cir in [circuitECR01, circuitECR10]:
+        probs = under_test.sim.dprobs(cir)
+        exp = expected_model.sim.dprobs(cir)
+
+        assert_probability_densities_are_equal(probs, exp, cir)
+
+def test_tensor_product_two_qubit_gates_dprobs_bulk():
+
+    num_qubits = 4
+
+    under_test, expected_model = build_models_for_testing(num_qubits, independent_gates=True, simplify_for_dprobs=True)
+
+    circuitECR01 = Circuit([[("Gecr", 0, 1), ("Gi", 2), ("Gzpi2", 3)]])
+    circuitECR10 = Circuit([[("Gecr", 1, 0), ("Gi", 2), ("Gzpi2", 3)]])
+
+    circ_list = [circuitECR01, circuitECR10]
+    exp = expected_model.sim.bulk_dprobs(circ_list)
+
+    dprobs = under_test.sim.bulk_dprobs(circ_list)
+
+
+    for cir in circ_list:
+        assert_probability_densities_are_equal(dprobs[cir], exp[cir], cir)
+
+test_tensor_product_two_qubit_gates_dprobs_bulk()
+
+def test_tensor_product_single_unitaries_yield_right_results_dprobs():
+    num_qubits = 2
+
+    under_test, expected_model = build_models_for_testing(num_qubits, independent_gates=True, simplify_for_dprobs=True)
+
+    circuitNone = Circuit([], num_lines=num_qubits)
+    single_layer = tuple([("Gxpi2", i) for i in range(num_qubits)])
+    circuitX = Circuit([single_layer], num_lines=num_qubits)
+    circuitY = Circuit([("Gypi2", i) for i in range(num_qubits)], num_lines=num_qubits)
+    circuitZ = Circuit([("Gzpi2", i) for i in range(num_qubits)], num_lines=num_qubits)
+    circuitIdle = Circuit([("Gi", i) for i in range(num_qubits)], num_lines=num_qubits)
+
+    circuits = [circuitNone, circuitX, circuitY, circuitZ, circuitIdle]
+    for cir in circuits:
+        probs = under_test.sim.dprobs(cir)
+        exp = expected_model.sim.dprobs(cir)
+
+        assert_probability_densities_are_equal(probs, exp, cir)
+
+def test_tensor_product_single_unitaries_yield_right_results_dprobs_bulk():
+
+    num_qubits = 2
+
+    under_test, expected_model = build_models_for_testing(num_qubits, independent_gates=True, simplify_for_dprobs=True)
+
+    circuitNone = Circuit([], num_lines=num_qubits)
+    single_layer = tuple([("Gxpi2", i) for i in range(num_qubits)])
+    circuitX = Circuit([single_layer], num_lines=num_qubits)
+    circuitY = Circuit([("Gypi2", i) for i in range(num_qubits)], num_lines=num_qubits)
+    circuitZ = Circuit([("Gzpi2", i) for i in range(num_qubits)], num_lines=num_qubits)
+    circuitIdle = Circuit([("Gi", i) for i in range(num_qubits)], num_lines=num_qubits)
+
+    circuits = [circuitNone, circuitX, circuitY, circuitZ, circuitIdle]
+
+    probs = under_test.sim.bulk_dprobs(circuits)
+    exp = expected_model.sim.bulk_dprobs(circuits)
+
+
+    for cir in circuits:
+        assert_probability_densities_are_equal(probs[cir], exp[cir], cir)
+
+# test_tensor_product_single_unitaries_yield_right_results_dprobs_bulk()
+
+def test_tensor_product_single_unitaries_random_collection_of_xyz_dprobs():
+
+    for qb in range(2, 4):
+
+        under_test, expected_model = build_models_for_testing(qb, independent_gates=True, simplify_for_dprobs=True)
+        allowed_gates = ['Gxpi2', 'Gypi2', "Gzpi2", 'Gi']
+
+        circuit100 = build_circuit(qb, 15, allowed_gates=allowed_gates)
+
+        probs = under_test.sim.dprobs(circuit100)
+        exp = expected_model.sim.dprobs(circuit100)
+
+        assert_probability_densities_are_equal(probs, exp, circuit100)
+
+
+def test_tensor_product_gates_with_implicit_idles_dprobs():
+
+    num_qubits = 2
+
+    under_test, expected_model = build_models_for_testing(num_qubits, independent_gates=True, simplify_for_dprobs=True)
+
+    gatenames = ["Gxpi2", "Gypi2", "Gzpi2", "Gi"]
+    for gate in tqdm(gatenames, "Gate: "):
+        for i in tqdm(range(num_qubits), "Qubit Location: "):
+            cir = Circuit([[(gate, i)]], num_lines=num_qubits)
+
+            probs = under_test.sim.dprobs(cir)
+            exp = expected_model.sim.dprobs(cir)
+            assert_probability_densities_are_equal(probs, exp, cir)
+
+    # Now for the two qubit gates. Gecr and GCNOT
+
+    # gatenames = ["Gecr", "Gcnot"]
+    gatenames = ["Gecr"]
+    gatenames = []
+    for gate in gatenames:
+        for i in range(num_qubits - 1):
+            cir = Circuit([[(gate, i, i + 1)]], num_lines=num_qubits)
+
+            probs = under_test.sim.dprobs(cir)
+            exp = expected_model.sim.dprobs(cir)
+            assert_probability_densities_are_equal(probs, exp, cir)
+
+            # Order swapped.
+            cir = Circuit([[(gate, i + 1, i)]], num_lines=num_qubits)
+
+            probs = under_test.sim.dprobs(cir)
+            exp = expected_model.sim.dprobs(cir)
+            assert_probability_densities_are_equal(probs, exp, cir)
+
+
+def test_tensor_product_multi_qubit_gates_with_structured_lanes_dprobs():
+
+    gates_to_used_qubits = {'Gxpi2': 1, 'Gypi2': 1, 'Gzpi2': 1, 'Gi': 1, 'Gswap': 2, 'Gcnot': 2, 'Gecr': 2}
+    gates_to_used_qubits = {'Gxpi2': 1, 'Gypi2': 1, 'Gzpi2': 1, 'Gi': 1, 'Gecr': 2}
+    for qb in range(5, 6):
+
+        lanes = [1, 2, 4]
+
+        under_test, expected_model = build_models_for_testing(qb, independent_gates=True, simplify_for_dprobs=True)
+
+        circuit = build_circuit_with_multiple_qubit_gates_with_designated_lanes(qb,
+                                                                                10,
+                                                                                lanes,
+                                                                                gates_to_used_qubits)
+
+        probs = under_test.sim.dprobs(circuit)
+        exp = expected_model.sim.dprobs(circuit)
+
+        assert_probability_densities_are_equal(probs, exp, circuit)
+
+#endregion Derivative of Probabilities consistencies.
+
+
+
+
+def test_reconstruct_full_matrices_returns_in_correct_order():
+
+    num_qubits = 4
+    under_test, expected_model = build_models_for_testing(num_qubits, independent_gates=True)
+
+    circ_list = []
+    circuitX = Circuit([("Gxpi2", i) for i in range(num_qubits)], num_lines=num_qubits)
+    circuitY = Circuit([("Gypi2", i) for i in range(num_qubits)], num_lines=num_qubits)
+    circuitZ = Circuit([("Gzpi2", i) for i in range(num_qubits)], num_lines=num_qubits)
+    circuitIdle = Circuit([("Gi", i) for i in range(num_qubits)], num_lines=num_qubits)
+    circuitECR01 = Circuit([[("Gecr", 0, 1), ("Gi", 2), ("Gzpi2", 3)]])
+    circuitECR10 = Circuit([[("Gecr", 1, 0), ("Gi", 2), ("Gzpi2", 3)]])
+
+    circ_list = [circuitX, circuitY, circuitZ, circuitIdle, circuitECR10, circuitECR01]
+
+    layout = under_test.sim.create_layout(circ_list, array_types='ep')
+
+    atom = layout.atoms[0]
+    atom.tree.collapse_circuits_to_process_matrices(under_test, None)
+
+    Gs, _ = atom.tree.reconstruct_full_matrices(under_test, None)
+
+    expected_vals = []
+    for cir in circ_list:
+        val = np.eye(4**num_qubits)
+        for i in range(len(cir)):
+            term = expected_model.circuit_layer_operator(cir[i]).to_dense()
+            val = term @ val
+
+        expected_vals.append(val)
+
+    
+    for i, cir in enumerate(circ_list):
+
+        kron_version = Gs[i].to_full_array()
+        assert np.allclose(kron_version, expected_vals[i])
+
+    print()
+
+def test_application_is_equivalent():
+
+    num_qubits = 4
+    under_test, expected_model = build_models_for_testing(num_qubits, independent_gates=True)
+
+    circ_list = []
+    circuitX = Circuit([("Gxpi2", i) for i in range(num_qubits)], num_lines=num_qubits)
+    circuitY = Circuit([("Gypi2", i) for i in range(num_qubits)], num_lines=num_qubits)
+    circuitZ = Circuit([("Gzpi2", i) for i in range(num_qubits)], num_lines=num_qubits)
+    circuitIdle = Circuit([("Gi", i) for i in range(num_qubits)], num_lines=num_qubits)
+    circuitECR01 = Circuit([[("Gecr", 0, 1), ("Gi", 2), ("Gzpi2", 3)]])
+    circuitECR10 = Circuit([[("Gecr", 1, 0), ("Gi", 2), ("Gzpi2", 3)]])
+
+    circ_list = [circuitX, circuitY, circuitZ, circuitIdle, circuitECR10, circuitECR01]
+
+    layout = under_test.sim.create_layout(circ_list, array_types='ep')
+
+    atom = layout.atoms[0]
+    atom.tree.collapse_circuits_to_process_matrices(under_test, None)
+
+    Gs, _ = atom.tree.reconstruct_full_matrices(under_test, None)
+
+    rhs = np.eye(4**num_qubits)
+    for i, cir in enumerate(circ_list):
+
+        kron_version = Gs[i].to_full_array()
+        
+        full = kron_version @ rhs
+        the_trick = Gs[i] @ rhs
+
+        assert np.allclose(the_trick, full)
+
+    print()
+    
+def test_reconstructed_vals_update_with_dprobs():
+
+    num_qubits = 4
+    under_test, expected_model = build_models_for_testing(num_qubits, independent_gates=True)
+
+    circ_list = []
+    circuitX = Circuit([("Gxpi2", i) for i in range(num_qubits)], num_lines=num_qubits)
+    circuitY = Circuit([("Gypi2", i) for i in range(num_qubits)], num_lines=num_qubits)
+    circuitZ = Circuit([("Gzpi2", i) for i in range(num_qubits)], num_lines=num_qubits)
+    circuitIdle = Circuit([("Gi", i) for i in range(num_qubits)], num_lines=num_qubits)
+    circuitECR01 = Circuit([[("Gecr", 0, 1), ("Gi", 2), ("Gzpi2", 3)]])
+    circuitECR10 = Circuit([[("Gecr", 1, 0), ("Gi", 2), ("Gzpi2", 3)]])
+
+    circ_list = [circuitX, circuitY, circuitZ, circuitIdle, circuitECR10, circuitECR01]
+    circ_list = [circuitX, circuitY, circuitIdle, circuitECR01, circuitECR10]
+
+    layout = under_test.sim.create_layout(circ_list, array_types='ep')
+
+    atom = layout.atoms[0]
+    atom.tree.collapse_circuits_to_process_matrices(under_test, None)
+
+    original_Gs, _ = atom.tree.reconstruct_full_matrices(under_test, None)
+
+    eps = 1e-7
+
+    orig_vec = under_test.to_vector().copy()
+    for i in range(under_test.num_params):
+
+        new_vec = orig_vec.copy()
+        new_vec[i] += eps
+
+        under_test.from_vector(new_vec)
+        expected_model.from_vector(new_vec)
+
+        atom.tree.collapse_circuits_to_process_matrices(under_test, i)
+
+        new_Gs, _ = atom.tree.reconstruct_full_matrices(under_test, None) # Return all of them
+
+        expected_vals = []
+        for cir in circ_list:
+            val = np.eye(4**num_qubits)
+            for j in range(len(cir)):
+                term = expected_model.circuit_layer_operator(cir[j]).to_dense()
+                val = term @ val
+
+            expected_vals.append(val)
+
+    
+        for j, cir in enumerate(circ_list):
+
+            kron_version = new_Gs[j].to_full_array()
+            assert np.allclose(kron_version, expected_vals[j])
+
+    print()
+
+
+test_reconstructed_vals_update_with_dprobs()
\ No newline at end of file
diff --git a/test/unit/tools/test_sequencetools.py b/test/unit/tools/test_sequencetools.py
new file mode 100644
index 000000000..ab8d7dd4c
--- /dev/null
+++ b/test/unit/tools/test_sequencetools.py
@@ -0,0 +1,132 @@
+import numpy as np
+from pygsti.tools.sequencetools import _compute_lcs_for_every_pair_of_sequences, create_tables_for_internal_LCS
+from pygsti.tools.sequencetools import conduct_one_round_of_lcs_simplification
+from pygsti.tools.sequencetools import simplify_internal_first_one_round
+
+def test_external_matches():
+
+    my_strings = ["ABAARCR12LIO", "QWERTYASDFGH", "QWEELLKJAT"]
+
+    tables, sequences = _compute_lcs_for_every_pair_of_sequences(my_strings, None, None, set([0,1,2]), 3)
+
+    assert np.max(tables) == 3
+
+    assert len(np.where(np.max(tables) == tables)[0]) == 1 # There is only one sequence present in this case.
+
+
+    if (1,2) in sequences:
+        assert sequences[(1,2)] == (0, 0, 3)
+    else:
+        assert (2,1) in sequences
+        assert sequences[(2,1)] == (0, 0, 3)
+
+
+def test_internal_matches():
+
+    my_strings = ["RACECAR", "AAAAQAAAA", "QWERTYQWEQWEQWE"]
+
+    tables, sequences = create_tables_for_internal_LCS(my_strings)
+
+    assert np.max(tables) == 4
+
+
+    assert sequences[1][tuple("AAAA")] == {0, 5}
+
+
+    my_strings = [my_strings[0]] + [my_strings[2]]
+
+    tables, sequences = create_tables_for_internal_LCS(my_strings)
+
+    assert np.max(tables) == 3
+    assert sequences[1][tuple("QWE")] == {0, 6, 9, 12}
+
+
+def test_one_round_update_collecting_tables_first():
+
+    example = [('R', 'A', 'C', 'E', 'C', 'A', 'R'),
+    ('A', 'A', 'A', 'A', 'Q', 'A', 'A', 'A', 'A'),
+    ('Q', 'W', 'E', 'R', 'T', 'Y', 'Q', 'W', 'E', 'Q', 'W', 'E', 'Q', 'W', 'E')]
+    example = [list(x) for x in example]
+    internal = create_tables_for_internal_LCS(example)
+    external = _compute_lcs_for_every_pair_of_sequences(example, None, None, set([0,1,2]), 3)
+
+    cache = {i: s for i,s in enumerate(example)}
+    updated, num, cache, seq_intro, ext_table, ext_seq, ext_dirty = conduct_one_round_of_lcs_simplification(example, external, internal, len(example), cache)
+
+    assert len(updated) == 4
+    assert "".join(updated[3]) == "AAAA"
+
+    assert cache[1] == [3,"Q",3]
+    assert np.allclose(seq_intro, np.array(3))
+
+    assert num == len(updated)
+
+
+def test_one_round_update_without_collecting_tables_first():
+
+    example = [('R', 'A', 'C', 'E', 'C', 'A', 'R'),
+    ('A', 'A', 'A', 'A', 'Q', 'A', 'A', 'A', 'A'),
+    ('Q', 'W', 'E', 'R', 'T', 'Y', 'Q', 'W', 'E', 'Q', 'W', 'E', 'Q', 'W', 'E')]
+    example = [list(x) for x in example]
+
+
+    cache = {i: s for i,s in enumerate(example)}
+    updated, num, cache, seq_intro, ext_table, ext_seq, ext_dirty = conduct_one_round_of_lcs_simplification(example, None, None, len(example), cache)
+
+    assert len(updated) == 4
+    assert "".join(updated[3]) == "AAAA"
+
+    assert cache[1] == [3,"Q",3]
+    assert np.allclose(seq_intro, np.array(3))
+
+    assert num == len(updated)
+
+
+def test_update_only_adds_those_strings_which_are_actually_used():
+    example = [('R', 'A', 'C', 'E', 'C', 'A', 'R'),
+        ('A', 'A', 'A', 'A', 'Q', 'A', 'A', 'A', 'A'),
+        ('Q', 'W', 'E', 'R', 'T', 'Y', 'Q', 'W', 'E', 'Q', 'W', 'E', 'Q', 'W', 'E')]
+    example = [list(x) for x in example]
+
+
+    cache = {i: s for i,s in enumerate(example)}
+    updated, num, cache, seq_intro, ext_table, ext_seq, ext_dirty = conduct_one_round_of_lcs_simplification(example, None, None, len(example), cache)
+
+    r2, num, c2, s2, ext_table, ext_seq, ext_dirty = conduct_one_round_of_lcs_simplification(updated, None, None, num, cache)
+
+    assert len(r2) == num
+
+    assert len(s2) == 1
+
+    assert 4 in c2[2]
+
+    assert len(c2[4]) == 3
+
+def test_multiple_successive_internal_updates_first():
+
+    strings_list = [list("IIIIIIAIIIIII")]
+
+    cache = {}
+    seq_ind_to_cache_ind = {}
+    updated_string_list, cache_num, cache, seq_intro = simplify_internal_first_one_round(strings_list,
+                                                                                         None,
+                                                                                         -1,
+                                                                                         cache,
+                                                                                         seq_ind_to_cache_ind)
+
+    assert -1 in cache.keys()
+
+    assert -2 == cache_num
+    assert len(updated_string_list) == 2
+
+    updated_string_list, cache_num, cache, seq_intro = simplify_internal_first_one_round(updated_string_list, 
+                                                                                         None,
+                                                                                         cache_num,
+                                                                                         cache,
+                                                                                         seq_ind_to_cache_ind)
+
+    assert -2 in cache.keys()
+    assert -3 == cache_num
+
+    assert len(updated_string_list) == 3
+    assert np.allclose(seq_intro, [-2])
\ No newline at end of file