From a93a435bca563632142822b91253dd37f15be75a Mon Sep 17 00:00:00 2001 From: Dale Roberts Date: Tue, 14 Apr 2026 14:16:58 +1000 Subject: [PATCH] Memory and performance improvements - Matrix class: packed/symmetric storage, attach-to-mmap, calloc allocator - LM solver + Anderson acceleration CLI - Static build fixes (glibc 2.34+ pthread) - Matrix test suite --- dynadjust/CMakeLists.txt | 56 +- .../dynadjust/dnaadjust/dnaadjust-multi.cpp | 13 +- .../dynadjust/dnaadjust/dnaadjust-stage.cpp | 118 +- dynadjust/dynadjust/dnaadjust/dnaadjust.cpp | 1507 +++++++++++++---- dynadjust/dynadjust/dnaadjust/dnaadjust.hpp | 152 +- .../dnaadjustwrapper/dnaadjustprogress.cpp | 25 +- .../dnaadjustwrapper/dnaadjustwrapper.cpp | 190 ++- .../include/config/dnaoptions-interface.hpp | 17 + dynadjust/include/config/dnaoptions.hpp | 25 +- .../functions/dnatemplatematrixfuncs.hpp | 4 +- .../include/math/dnamatrix_contiguous.cpp | 791 ++++++++- .../include/math/dnamatrix_contiguous.hpp | 183 +- dynadjust/include/memory/dnafile_mapping.cpp | 78 +- dynadjust/include/memory/dnafile_mapping.hpp | 14 +- tests/test_matrix.cpp | 945 +++++++++++ 15 files changed, 3546 insertions(+), 572 deletions(-) diff --git a/dynadjust/CMakeLists.txt b/dynadjust/CMakeLists.txt index c6d523264..b3bdb844a 100644 --- a/dynadjust/CMakeLists.txt +++ b/dynadjust/CMakeLists.txt @@ -195,7 +195,7 @@ else() endif() message(STATUS "Found XercesC (${XercesC_INCLUDE_DIRS} ${XercesC_LIBRARIES})") -# Static builds need ICU for Xerces transcoding (glibc iconv can't dlopen gconv modules in static binaries) +# Static builds need ICU for Xerces transcoding (glibc iconv can't dlopen gconv modules in static binaries). 
if(BUILD_STATIC AND UNIX AND NOT APPLE) find_package(ICU COMPONENTS uc data) if(ICU_FOUND) @@ -1132,6 +1132,53 @@ if (BUILD_TESTING) add_test (NAME test-urban-phased-network COMMAND $ urban.phased.adj urban.phased.adj.expected --skip-to-marker "M Station 1" -t 0.001) add_test (NAME test-urban-thread-network COMMAND $ urban_mt.phased-mt.adj urban_mt.phased-mt.adj.expected --skip-to-marker "M Station 1" -t 0.01 -v) + # ........................................................................ + # LM / Trust-Region solver tests + # ........................................................................ + + # LM-1. GNSS simultaneous with LM — should match GN results + add_test (NAME import-gnss-lm COMMAND $ -n gnss_lm gnss-network.stn gnss-network.msr -r GDA2020 --flag-unused-stations --quiet) + add_test (NAME geoid-gnss-lm COMMAND $ gnss_lm -g gnss-network-geoid.gsb --export-dna-geo) + add_test (NAME reftran-gnss-lm COMMAND $ gnss_lm --export-dna --export-xml) + add_test (NAME adjust-gnss-lm COMMAND $ gnss_lm --output-adj-msr --scale-normals-to-unity --lm-enabled) + add_test (NAME test-gnss-lm-vs-expected COMMAND $ gnss_lm.simult.adj gnss.simult.adj.expected --skip-headers 52 -t 0.001) + + # LM-2. Urban phased with LM — should match GN results + add_test (NAME import-urban-lm COMMAND $ -n urban_lm urban-network.stn urban-network.msr --flag-unused-stations --quiet) + add_test (NAME geoid-urban-lm COMMAND $ urban_lm -g urban-network-geoid.gsb --export-dna-geo) + add_test (NAME segment-urban-lm COMMAND $ urban_lm --min 50 --max 150) + add_test (NAME adjust-urban-lm COMMAND $ urban_lm --output-adj-msr --phased --lm-enabled) + add_test (NAME test-urban-lm-vs-expected COMMAND $ urban_lm.phased.adj urban.phased.adj.expected --skip-to-marker "M Station 1" -t 0.001) + + # LM-3. 
Urban multi-threaded phased with LM — should match GN results + add_test (NAME import-urban-mt-lm COMMAND $ -n urban_mt_lm urban-network.stn urban-network.msr --quiet) + add_test (NAME reftran-urban-mt-lm COMMAND $ urban_mt_lm -r gda2020) + add_test (NAME geoid-urban-mt-lm COMMAND $ urban_mt_lm -g urban-network-geoid.gsb) + add_test (NAME segment-urban-mt-lm COMMAND $ urban_mt_lm --min 50 --max 85) + add_test (NAME adjust-urban-mt-lm COMMAND $ urban_mt_lm --output-adj-msr --multi --lm-enabled) + add_test (NAME test-urban-mt-lm-vs-expected COMMAND $ urban_mt_lm.phased-mt.adj urban_mt.phased-mt.adj.expected --skip-to-marker "M Station 1" -t 0.01) + + # LM-4. DSG distance-only simultaneous — multi-iteration stress test + add_test (NAME import-dsg-lm COMMAND $ -n dsg_lm dsg.stn dsg.msr --flag-unused-stations --quiet) + add_test (NAME adjust-dsg-lm COMMAND $ dsg_lm --output-adj-msr --max-iterations 50 --lm-enabled) + + # LM-5. DSG distance-only simultaneous — GN baseline for comparison + add_test (NAME import-dsg-gn COMMAND $ -n dsg_gn dsg.stn dsg.msr --flag-unused-stations --quiet) + add_test (NAME adjust-dsg-gn COMMAND $ dsg_gn --output-adj-msr --max-iterations 50) + add_test (NAME test-dsg-lm-vs-gn COMMAND $ dsg_lm.simult.adj dsg_gn.simult.adj --skip-headers 52 -t 0.01) + + # LM-6. DSG distance-only phased with LM + add_test (NAME import-dsg-ph-lm COMMAND $ -n dsg_ph_lm dsg.stn dsg.msr --flag-unused-stations --quiet) + add_test (NAME segment-dsg-ph-lm COMMAND $ dsg_ph_lm --min 10 --max 30) + add_test (NAME adjust-dsg-ph-lm COMMAND $ dsg_ph_lm --output-adj-msr --phased --max-iterations 50 --lm-enabled) + + # LM-7. 
GNSS simultaneous with large initial lambda (robustness check) + add_test (NAME import-gnss-lm-large-lambda COMMAND $ -n gnss_lm_ll gnss-network.stn gnss-network.msr -r GDA2020 --flag-unused-stations --quiet) + add_test (NAME geoid-gnss-lm-large-lambda COMMAND $ gnss_lm_ll -g gnss-network-geoid.gsb --export-dna-geo) + add_test (NAME reftran-gnss-lm-large-lambda COMMAND $ gnss_lm_ll --export-dna --export-xml) + add_test (NAME adjust-gnss-lm-large-lambda COMMAND $ gnss_lm_ll --output-adj-msr --scale-normals-to-unity --lm-enabled --lm-lambda-init 10.0) + add_test (NAME test-gnss-lm-large-lambda COMMAND $ gnss_lm_ll.simult.adj gnss.simult.adj.expected --skip-headers 52 -t 0.001) + # 8. Source tag preservation in --export-xml (issue #317) # Import XML data with tags and verify they are preserved in export add_test (NAME import-source-test COMMAND $ -n srctest source-test-stn.xml source-test-msr.xml --export-xml -r GDA2020) @@ -1998,6 +2045,13 @@ if (BUILD_TESTING) set_tests_properties(test-urban-phased-network PROPERTIES DEPENDS adjust-urban-network) set_tests_properties(test-urban-thread-network PROPERTIES DEPENDS adjust-urban-network-thread-01) + # LM test dependencies + set_tests_properties(test-gnss-lm-vs-expected PROPERTIES DEPENDS adjust-gnss-lm) + set_tests_properties(test-urban-lm-vs-expected PROPERTIES DEPENDS adjust-urban-lm) + set_tests_properties(test-urban-mt-lm-vs-expected PROPERTIES DEPENDS adjust-urban-mt-lm) + set_tests_properties(test-dsg-lm-vs-gn PROPERTIES DEPENDS "adjust-dsg-lm;adjust-dsg-gn") + set_tests_properties(test-gnss-lm-large-lambda PROPERTIES DEPENDS adjust-gnss-lm-large-lambda) + set_tests_properties(ref-itrf-pmm-06 PROPERTIES DEPENDS ref-itrf-pmm-05) #set_tests_properties(ref-itrf-pmm-07 PROPERTIES DEPENDS ref-itrf-pmm-06) diff --git a/dynadjust/dynadjust/dnaadjust/dnaadjust-multi.cpp b/dynadjust/dynadjust/dnaadjust/dnaadjust-multi.cpp index 7e01b3372..c9d5a96f8 100644 --- a/dynadjust/dynadjust/dnaadjust/dnaadjust-multi.cpp +++ 
b/dynadjust/dynadjust/dnaadjust/dnaadjust-multi.cpp @@ -91,6 +91,12 @@ bool prepareAdjustmentExceptionThrown(const std::vector& pre // void dna_adjust::AdjustPhasedMultiThread() { + if (projectSettings_.a.lm_enabled) + { + AdjustPhasedMultiThreadLM(); + return; + } + initialiseIteration(); std::string corr_msg; @@ -295,7 +301,7 @@ void dna_adjust::SolveMT(bool COMPUTE_INVERSE, const UINT32& block) { // Compute inverse of normals (aposteriori variance matrix) // (AT * V-1 * A)-1 - FormInverseVarianceMatrix(&(v_normalsR_.at(block)), false); + FormInverseVarianceMatrix(&(v_normalsR_.at(block)), false, true); } // compute weighted "measured minus computed" @@ -304,7 +310,10 @@ void dna_adjust::SolveMT(bool COMPUTE_INVERSE, const UINT32& block) // Solve corrections from normal equations v_correctionsR_.at(block).redim(v_designR_.at(block).columns(), 1); - v_correctionsR_.at(block).multiply(v_normalsR_.at(block), "N", At_Vinv_m, "N"); + if (v_normalsR_.at(block).is_symmetric()) + v_correctionsR_.at(block).multiply_sym(v_normalsR_.at(block), At_Vinv_m); + else + v_correctionsR_.at(block).multiply(v_normalsR_.at(block), "N", At_Vinv_m, "N"); // debug output? 
if (projectSettings_.g.verbose > 3) diff --git a/dynadjust/dynadjust/dnaadjust/dnaadjust-stage.cpp b/dynadjust/dynadjust/dnaadjust/dnaadjust-stage.cpp index e0e1175d9..c5e18e255 100644 --- a/dynadjust/dynadjust/dnaadjust/dnaadjust-stage.cpp +++ b/dynadjust/dynadjust/dnaadjust/dnaadjust-stage.cpp @@ -173,7 +173,7 @@ void dna_adjust::DeserialiseBlockFromMappedFile(const UINT32& block, const int f break; case sf_normals_r: addr = normalsR_map_.GetBlockRegionAddr(block); - v_normalsR_.at(block).ReadMappedFileRegion(addr); + v_normalsR_.at(block).AttachMappedFileRegion(addr); break; case sf_atvinv: v_AtVinv_.at(block).allocate(); @@ -183,47 +183,47 @@ void dna_adjust::DeserialiseBlockFromMappedFile(const UINT32& block, const int f break; case sf_meas_minus_comp: addr = measMinusComp_map_.GetBlockRegionAddr(block); - v_measMinusComp_.at(block).ReadMappedFileRegion(addr); + v_measMinusComp_.at(block).AttachMappedFileRegion(addr); break; case sf_estimated_stns: addr = estimatedStations_map_.GetBlockRegionAddr(block); - v_estimatedStations_.at(block).ReadMappedFileRegion(addr); + v_estimatedStations_.at(block).AttachMappedFileRegion(addr); break; case sf_original_stns: addr = originalStations_map_.GetBlockRegionAddr(block); - v_originalStations_.at(block).ReadMappedFileRegion(addr); + v_originalStations_.at(block).AttachMappedFileRegion(addr); break; case sf_rigorous_stns: addr = rigorousStations_map_.GetBlockRegionAddr(block); - v_rigorousStations_.at(block).ReadMappedFileRegion(addr); + v_rigorousStations_.at(block).AttachMappedFileRegion(addr); break; case sf_junction_vars: addr = junctionVariances_map_.GetBlockRegionAddr(block); - v_junctionVariances_.at(block).ReadMappedFileRegion(addr); + v_junctionVariances_.at(block).AttachMappedFileRegion(addr); break; case sf_junction_vars_f: addr = junctionVariancesFwd_map_.GetBlockRegionAddr(block); - v_junctionVariancesFwd_.at(block).ReadMappedFileRegion(addr); + v_junctionVariancesFwd_.at(block).AttachMappedFileRegion(addr); 
break; case sf_junction_ests_f: addr = junctionEstimatesFwd_map_.GetBlockRegionAddr(block); - v_junctionEstimatesFwd_.at(block).ReadMappedFileRegion(addr); + v_junctionEstimatesFwd_.at(block).AttachMappedFileRegion(addr); break; case sf_junction_ests_r: addr = junctionEstimatesRev_map_.GetBlockRegionAddr(block); - v_junctionEstimatesRev_.at(block).ReadMappedFileRegion(addr); + v_junctionEstimatesRev_.at(block).AttachMappedFileRegion(addr); break; case sf_rigorous_vars: addr = rigorousVariances_map_.GetBlockRegionAddr(block); - v_rigorousVariances_.at(block).ReadMappedFileRegion(addr); + v_rigorousVariances_.at(block).AttachMappedFileRegion(addr); break; case sf_prec_adj_msrs: addr = precAdjMsrs_map_.GetBlockRegionAddr(block); - v_precAdjMsrsFull_.at(block).ReadMappedFileRegion(addr); + v_precAdjMsrsFull_.at(block).AttachMappedFileRegion(addr); break; case sf_corrections: addr = corrections_map_.GetBlockRegionAddr(block); - v_corrections_.at(block).ReadMappedFileRegion(addr); + v_corrections_.at(block).AttachMappedFileRegion(addr); if (v_blockMeta_.at(block)._blockLast) v_correctionsR_.at(block).allocate(); @@ -240,8 +240,8 @@ void dna_adjust::SerialiseBlockToMappedFile(const UINT32& block, const int file_ { // serialise all. 
That is, call this function again, but with // arguments for all files - SerialiseBlockToMappedFile(block, 16, - sf_normals, sf_normals_r, sf_atvinv, sf_design, sf_meas_minus_comp, + SerialiseBlockToMappedFile(block, 12, + sf_normals_r, sf_meas_minus_comp, sf_estimated_stns, sf_original_stns, sf_rigorous_stns, sf_junction_vars, sf_junction_vars_f, sf_junction_ests_f, sf_junction_ests_r, sf_rigorous_vars, sf_prec_adj_msrs, sf_corrections); @@ -257,16 +257,10 @@ void dna_adjust::SerialiseBlockToMappedFile(const UINT32& block, const int file_ { switch (va_arg(vlist, int)) { - case sf_normals: - break; case sf_normals_r: addr = normalsR_map_.GetBlockRegionAddr(block); v_normalsR_.at(block).WriteMappedFileRegion(addr); break; - case sf_atvinv: - break; - case sf_design: - break; case sf_meas_minus_comp: addr = measMinusComp_map_.GetBlockRegionAddr(block); v_measMinusComp_.at(block).WriteMappedFileRegion(addr); @@ -395,7 +389,11 @@ void dna_adjust::OpenStageFileStreams(const int file_count, ...) 
} std::stringstream ss; - ss << projectSettings_.g.output_folder << FOLDER_SLASH << projectSettings_.g.network_name << "-"; + if (projectSettings_.a.stage_path.empty()) + ss << projectSettings_.g.output_folder; + else + ss << projectSettings_.a.stage_path; + ss << FOLDER_SLASH << projectSettings_.g.network_name << "-"; std::string filePath(ss.str()); v_stageFileStreams_.clear(); @@ -851,6 +849,46 @@ void dna_adjust::OffloadBlockToMappedFile(const UINT32& block) // Unload block matrix data from memory UnloadBlock(block); + + // Advise kernel that mapped pages for this block are no longer needed, + // freeing page cache for upcoming blocks + AdviseBlockDontNeed(block); +} + + +void dna_adjust::AdviseBlockDontNeed(const UINT32& block) +{ + using advice = boost::interprocess::mapped_region::advice_types; + normalsR_map_.AdviseRegion(block, advice::advice_dontneed); + measMinusComp_map_.AdviseRegion(block, advice::advice_dontneed); + estimatedStations_map_.AdviseRegion(block, advice::advice_dontneed); + originalStations_map_.AdviseRegion(block, advice::advice_dontneed); + rigorousStations_map_.AdviseRegion(block, advice::advice_dontneed); + junctionVariances_map_.AdviseRegion(block, advice::advice_dontneed); + junctionVariancesFwd_map_.AdviseRegion(block, advice::advice_dontneed); + junctionEstimatesFwd_map_.AdviseRegion(block, advice::advice_dontneed); + junctionEstimatesRev_map_.AdviseRegion(block, advice::advice_dontneed); + rigorousVariances_map_.AdviseRegion(block, advice::advice_dontneed); + precAdjMsrs_map_.AdviseRegion(block, advice::advice_dontneed); + corrections_map_.AdviseRegion(block, advice::advice_dontneed); +} + + +void dna_adjust::AdviseBlockWillNeed(const UINT32& block) +{ + using advice = boost::interprocess::mapped_region::advice_types; + normalsR_map_.AdviseRegion(block, advice::advice_willneed); + measMinusComp_map_.AdviseRegion(block, advice::advice_willneed); + estimatedStations_map_.AdviseRegion(block, advice::advice_willneed); + 
originalStations_map_.AdviseRegion(block, advice::advice_willneed); + rigorousStations_map_.AdviseRegion(block, advice::advice_willneed); + junctionVariances_map_.AdviseRegion(block, advice::advice_willneed); + junctionVariancesFwd_map_.AdviseRegion(block, advice::advice_willneed); + junctionEstimatesFwd_map_.AdviseRegion(block, advice::advice_willneed); + junctionEstimatesRev_map_.AdviseRegion(block, advice::advice_willneed); + rigorousVariances_map_.AdviseRegion(block, advice::advice_willneed); + precAdjMsrs_map_.AdviseRegion(block, advice::advice_willneed); + corrections_map_.AdviseRegion(block, advice::advice_willneed); } @@ -896,7 +934,7 @@ void dna_adjust::UnloadBlock(const UINT32& block, const int file_count, ...) if (file_count == 0) { // deserialise all - UnloadBlock(block, 16, + UnloadBlock(block, 15, sf_normals, sf_normals_r, sf_atvinv, sf_design, sf_meas_minus_comp, sf_estimated_stns, sf_original_stns, sf_rigorous_stns, sf_junction_vars, sf_junction_vars_f, sf_junction_ests_f, sf_junction_ests_r, @@ -907,58 +945,60 @@ void dna_adjust::UnloadBlock(const UINT32& block, const int file_count, ...) va_list vlist; va_start(vlist, file_count); - // Unload block matrix data from memory + // Unload block matrix data from memory by freeing buffers. + // Use deallocate() instead of explicit destructor calls to keep + // the matrix objects in a valid state for later reuse. 
for (UINT16 file(0); file combineAdjustmentQueue; concurrent_queue prepareAdjustmentQueue; std::exception_ptr fwd_error, rev_error, cmb_error, prep_error; +void dna_adjust::SetMaxBlasThreads(int n) { dynadjust::math::set_max_blas_threads(n); } +int dna_adjust::GetMaxBlasThreads() { return dynadjust::math::get_max_blas_threads(); } + dna_adjust::dna_adjust() : isPreparing_(false) , isAdjusting_(false) @@ -42,6 +45,7 @@ dna_adjust::dna_adjust() , isAdjustmentQuestionable_(false) , blockCount_(1) , currentBlock_(0) + , lastBlockElapsedMs_(0) , total_time_(0) , adjustStatus_(ADJUST_SUCCESS) , currentIteration_(0) @@ -172,8 +176,15 @@ UINT32& dna_adjust::incrementIteration() return ++currentIteration_; } -void dna_adjust::initialiseIteration(const UINT32& iteration) -{ +void dna_adjust::decrementIteration() +{ + std::lock_guard lock(current_iterationMutex); + if (currentIteration_ > 0) + --currentIteration_; +} + +void dna_adjust::initialiseIteration(const UINT32& iteration) +{ std::lock_guard lock(current_iterationMutex); currentIteration_ = iteration; } @@ -240,6 +251,7 @@ void dna_adjust::PrepareAdjustment(const project_settings& projectSettings) isPreparing_ = true; isAdjusting_ = true; isCombining_ = false; + rebuildingDesign_ = false; isFirstTimeAdjustment_ = true; projectSettings_ = projectSettings; @@ -528,14 +540,14 @@ void dna_adjust::UpdateAdjustment(bool iterate) // Hence, only design and msr-comp are required so as to compute stats if (!iterate) continue; - + switch (projectSettings_.a.adjust_mode) { case PhasedMode: case Phased_Block_1Mode: v_normals_.at(block).zero(); UpdateNormals(block, false); - + if (projectSettings_.a.multi_thread) { v_estimatedStationsR_.at(block) = v_rigorousStations_.at(block); @@ -545,9 +557,9 @@ void dna_adjust::UpdateAdjustment(bool iterate) } // Back up normals. 
This copy contains the contributions from all - // apriori measurement variances, excluding parameter station + // apriori measurement variances, excluding parameter station // variances and junction station variances - v_normalsR_.at(block) = v_normals_.at(block); + v_normalsR_.at(block) = v_normals_.at(block); AddConstraintStationstoNormalsForward(block); break; @@ -742,17 +754,17 @@ void dna_adjust::PrepareStationandVarianceMatrices(const UINT32& block) // resize rigorous coordinate estimate array v_rigorousStations_.at(block).redim(v_unknownsCount_.at(block), 1); - - // resize rigorous variances - v_rigorousVariances_.at(block).redim(v_unknownsCount_.at(block), v_unknownsCount_.at(block)); + + if (projectSettings_.a.stage || projectSettings_.a.multi_thread) + v_rigorousVariances_.at(block).redim_packed(v_unknownsCount_.at(block)); // resize junction variances - v_junctionVariances_.at(block).redim(j, j); + v_junctionVariances_.at(block).redim_packed(j); // resize junction station coordinate estimate arrays if (!v_blockMeta_.at(block)._blockLast && !v_blockMeta_.at(block)._blockIsolated) { - v_junctionVariancesFwd_.at(block).redim(j, j); + v_junctionVariancesFwd_.at(block).redim_packed(j); v_junctionEstimatesFwd_.at(block).redim(j, 1); v_junctionEstimatesRev_.at(block+1).redim(j, 1); } @@ -860,11 +872,10 @@ void dna_adjust::PrepareDesignAndMsrMnsCmpMatrices(const UINT32& block) // Simultaneous, phased (multithreaded and block1) adjustments // Redim all matrices - v_normals_.at(block).redim(v_unknownsCount_.at(block), v_unknownsCount_.at(block)); + v_normals_.at(block).redim_packed(v_unknownsCount_.at(block)); v_design_.at(block).redim(v_measurementCount_.at(block), v_unknownsCount_.at(block)); v_corrections_.at(block).redim(v_unknownsCount_.at(block), 1); - v_precAdjMsrsFull_.at(block).redim(v_measurementVarianceCount_.at(block), 1); // All blocks in v_correctionsR_ are used for multi thread mode, but only the last is used // in single thread mode @@ -897,9 
+908,7 @@ void dna_adjust::PrepareDesignAndMsrMnsCmpMatricesStage(const UINT32& block) if (projectSettings_.a.recreate_stage_files || !bms_meta_.reduced) { // Redim NormalsR - v_normalsR_.at(block).redim( - v_unknownsCount_.at(block), - v_unknownsCount_.at(block)); + v_normalsR_.at(block).redim_packed(v_unknownsCount_.at(block)); // Redimension the corrections matrices v_corrections_.at(block).redim(v_unknownsCount_.at(block), 1); @@ -930,7 +939,41 @@ void dna_adjust::PrepareDesignAndMsrMnsCmpMatricesStage(const UINT32& block) // Creating new memory mapped files? Form the design matrices FillDesignNormalMeasurementsMatrices(true, block, false); } - + + +void dna_adjust::RebuildDesignAndAtVinv(const UINT32& block) +{ + UINT32 pseudoMsrElemCount(0); + + if (!v_blockMeta_.at(block)._blockIsolated) + { + UINT32 pseudoMsrCount = static_cast(v_JSL_.at(block).size()); + if (!v_blockMeta_.at(block)._blockFirst) + pseudoMsrCount += static_cast(v_JSL_.at(block-1).size()); + pseudoMsrElemCount = pseudoMsrCount * 3; + } + + v_design_.at(block).redim(v_measurementCount_.at(block), v_unknownsCount_.at(block)); + v_AtVinv_.at(block).redim( + v_unknownsCount_.at(block), + v_measurementCount_.at(block) + pseudoMsrElemCount); + + if (pseudoMsrElemCount > 0) + v_AtVinv_.at(block).shrink(0, pseudoMsrElemCount); + + assert(v_design_.at(block).getbuffer() != nullptr && "RebuildDesignAtVinv: design null after redim"); + assert(v_AtVinv_.at(block).getbuffer() != nullptr && "RebuildDesignAtVinv: AtVinv null after redim"); + assert(v_estimatedStations_.at(block).getbuffer() != nullptr && "RebuildDesignAtVinv: estimatedStations null"); + assert(v_measMinusComp_.at(block).getbuffer() != nullptr && "RebuildDesignAtVinv: measMinusComp null"); + + rebuildingDesign_ = true; + FillDesignNormalMeasurementsMatrices(true, block, false, true); + rebuildingDesign_ = false; + + assert(v_design_.at(block).getbuffer() != nullptr && "RebuildDesignAtVinv: design null after Fill"); + 
assert(v_AtVinv_.at(block).getbuffer() != nullptr && "RebuildDesignAtVinv: AtVinv null after Fill"); +} + // Re-form At * V-1 for next block using estimated junction parameter station variances // nextBlock = currentBlock+1 @@ -1440,7 +1483,7 @@ void dna_adjust::AddMsrtoNormalsCoVar3(const UINT32& design_row, const UINT32& s normals->elementadd(stn3+row, stn1+col, AtVinv->get(stn3+row, design_row) * design->get(design_row, stn1+col)); - + // 2-3 normals->elementadd(stn2+row, stn3+col, AtVinv->get(stn2+row, design_row) * design->get(design_row, stn3+col)); @@ -2382,9 +2425,20 @@ void dna_adjust::CloseOutputFiles() void dna_adjust::AdjustSimultaneous() { + if (projectSettings_.a.lm_enabled) + { + AdjustSimultaneousLM(); + return; + } + adjustStatus_ = ADJUST_SUCCESS; initialiseIteration(); - + + // Reset oscillation diagnostics + corrPrev_.clear(); + stnOscCount_.clear(); + oscHistory_.clear(); + std::ostringstream ss; std::string corr_msg; std::chrono::milliseconds elapsed_time(std::chrono::milliseconds(0)); @@ -2430,6 +2484,7 @@ void dna_adjust::AdjustSimultaneous() // Compute and print largest correction maxCorr_ = v_corrections_.at(0).compute_maximum_value(); OutputLargestCorrection(corr_msg); + UpdateIterationDiagnostics(); // update data for messages iterationCorrections_.add_message(corr_msg); @@ -2491,7 +2546,7 @@ void dna_adjust::ValidateandFinaliseAdjustment(cpu_timer& tot_time) fabs(maxCorr_) > projectSettings_.a.iteration_threshold) adjustStatus_ = ADJUST_MAX_ITERATIONS_EXCEEDED; - // Back up simultaneous rigorous variance estimates (for serialising + // Back up simultaneous rigorous variance estimates (for serialising // to disk at SerialiseAdjustedVarianceMatrices() ), so that executing adjust // in report-results mode has access to the latest variance estimates switch (projectSettings_.a.adjust_mode) @@ -2524,15 +2579,30 @@ void dna_adjust::PrintAdjustmentTime(cpu_timer& time, _TIMER_TYPE_ timerType) void dna_adjust::AdjustPhased() { + if 
(projectSettings_.a.lm_enabled) + { + AdjustPhasedLM(); + return; + } + initialiseIteration(); + // Reset oscillation diagnostics + corrPrev_.clear(); + stnOscCount_.clear(); + oscHistory_.clear(); + + // Initialise Anderson acceleration if enabled + if (projectSettings_.a.aa_enabled) + InitialiseAndersonAcceleration(); + std::string corr_msg; std::ostringstream ss; UINT32 i; bool iterate(true); cpu_timer it_time, tot_time; - + // do until convergence criteria is met for (i=0; iPrintIteration(incrementIteration()); - + it_time.start(); + // Snapshot coordinates before iteration for Anderson acceleration + if (projectSettings_.a.aa_enabled) + ExtractStationCoordinates(aa_x_snapshot_); + AdjustPhasedForward(); if (IsCancelled()) break; @@ -2569,6 +2643,11 @@ void dna_adjust::AdjustPhased() // Calculate and print largest adjustment correction and station ID OutputLargestCorrection(corr_msg); + UpdateIterationDiagnostics(); + + // Apply Anderson acceleration (mixes last m iterates to accelerate convergence) + if (projectSettings_.a.aa_enabled) + ApplyAndersonAcceleration(i); iterationCorrections_.add_message(corr_msg); iterationTimes_.add_message(FormatElapsedTime(it_time.elapsed().wall.count() / 1.0e9)); @@ -2584,7 +2663,7 @@ void dna_adjust::AdjustPhased() // Similar to PrepareAdjustment, UpdateAdjustment prepares every block // in the network so that forward and reverse adjustments can commence // at the same time. 
- UpdateAdjustment(iterate); + UpdateAdjustment(iterate); if (IsCancelled()) break; @@ -2593,7 +2672,7 @@ void dna_adjust::AdjustPhased() { // Compute network statistics ComputeStatisticsOnIteration(); - + // Print statistics summary to adj file printer_->PrintStatistics(false); } @@ -2699,9 +2778,10 @@ void dna_adjust::AdjustPhasedForward() UINT32 currentBlock(0); - // For staged adjustments, load the first block from mapped memory file + // For staged adjustments, prefetch and load the first block from mapped memory file if (projectSettings_.a.stage) { + AdviseBlockWillNeed(currentBlock); DeserialiseBlockFromMappedFile(currentBlock); RebuildNormals(currentBlock, __forward__, true, true); if (IsCancelled()) @@ -2739,16 +2819,24 @@ void dna_adjust::AdjustPhasedForward() // At this point, whether first iteration or not, if currentBlock is the first block, // the normals will have been initialised. For all later blocks, the normals will contain // the contribution of junction station coordinates and variances from preceding blocks. - // In either case, the block is ready for adjustment. Junction station coordinates and + // In either case, the block is ready for adjustment. Junction station coordinates and // variances are carried forward below + assert(v_design_.at(currentBlock).getbuffer() != nullptr && "AdjFwd: design null before SolveTry"); + assert(v_AtVinv_.at(currentBlock).getbuffer() != nullptr && "AdjFwd: AtVinv null before SolveTry"); + assert(v_normals_.at(currentBlock).getbuffer() != nullptr && "AdjFwd: normals null before SolveTry"); + assert(v_measMinusComp_.at(currentBlock).getbuffer() != nullptr && "AdjFwd: measMinusComp null before SolveTry"); + // Least Squares Solution - SolveTry(true, currentBlock); + if (projectSettings_.a.lm_enabled) + SolveLMTry(true, currentBlock, true); + else + SolveTry(true, currentBlock); // Does the user want to print adjusted measurements // on each iteration? 
- if (projectSettings_.o._adj_msr_iteration || - projectSettings_.o._adj_stn_iteration || + if (projectSettings_.o._adj_msr_iteration || + projectSettings_.o._adj_stn_iteration || projectSettings_.o._cmp_msr_iteration) adj_file << " done." << std::endl; @@ -2770,21 +2858,21 @@ void dna_adjust::AdjustPhasedForward() // OK, now shrink matrices back to normal size ShrinkForwardMatrices(currentBlock); - // Carry the estimated junction station coordinates and variances + // Carry the estimated junction station coordinates and variances // to the next block for applicable blocks only. For staged - // Deserialise the next block from mapped files and rebuild + // Deserialise the next block from mapped files and rebuild // normals CarryForwardJunctions(currentBlock, currentBlock+1); // For staged adjustments, write to disk and unload matrix data if (projectSettings_.a.stage) - // Don't offload last block since the reverse adjustment + // Don't offload last block since the reverse adjustment // will need this block if (currentBlock < (blockCount_ - 1)) OffloadBlockToMappedFile(currentBlock); } } - + void dna_adjust::PurgeMatricesFromDisk() { @@ -2889,8 +2977,7 @@ void dna_adjust::PrepareAdjustmentBlock(const UINT32 block, const UINT32 thread_ v_normalsR_.at(block) = v_normals_.at(block); if (projectSettings_.a.multi_thread) { - v_normalsRC_.at(block).redim( - v_normalsR_.at(block).rows(), v_normalsR_.at(block).columns()); + v_normalsRC_.at(block).redim_packed(v_normalsR_.at(block).rows()); } switch (projectSettings_.a.adjust_mode) @@ -2944,6 +3031,8 @@ void dna_adjust::ShrinkForwardMatrices(const UINT32 currentBlock) { pseudoMsrElemCount = static_cast(v_JSL_.at(currentBlock-1).size() * 3); + assert(v_measMinusComp_.at(currentBlock).getbuffer() != nullptr && "ShrinkFwd: measMinusComp null"); + assert(v_AtVinv_.at(currentBlock).getbuffer() != nullptr && "ShrinkFwd: AtVinv null"); v_measMinusComp_.at(currentBlock).shrink(pseudoMsrElemCount, 0); 
v_AtVinv_.at(currentBlock).shrink(0, pseudoMsrElemCount); } @@ -2978,7 +3067,9 @@ void dna_adjust::UpdateEstimatesForward(const UINT32 currentBlock) // Now copy 'estimated' coordinates to 'rigorous' for comparison on the next iteration v_rigorousStations_.at(currentBlock) = v_estimatedStations_.at(currentBlock); + assert(v_normals_.at(currentBlock).getbuffer() != nullptr && "UpdateEstFwd: normals null before copy to rigorousVariances"); v_rigorousVariances_.at(currentBlock) = v_normals_.at(currentBlock); + assert(v_rigorousVariances_.at(currentBlock).getbuffer() != nullptr && "UpdateEstFwd: rigorousVariances null after copy"); // Temporarily hold corrections for last block. These corrections will be restored // in UpdateEstimatesFinal() @@ -3028,9 +3119,10 @@ void dna_adjust::CarryForwardJunctions(const UINT32 thisBlock, const UINT32 next if (v_blockMeta_.at(nextBlock)._blockIsolated) return; - // For staged adjustments, load block info for nextBlock + // For staged adjustments, prefetch and load block info for nextBlock if (projectSettings_.a.stage) { + AdviseBlockWillNeed(nextBlock); DeserialiseBlockFromMappedFile(nextBlock); RebuildNormals(nextBlock, __forward__, true, true); } @@ -3065,7 +3157,7 @@ bool dna_adjust::PrepareAdjustmentReverse(const UINT32 currentBlock, bool MT_Rev // OK. 
currentBlock is the last block, and this is the commencement of // a reverse run, so reset coordinates - matrix_2d* estimatedStations(&v_estimatedStations_.at(currentBlock)); + matrix_2d* estimatedStations(&v_estimatedStations_.at(currentBlock)); if (MT_ReverseOrCombine) { @@ -3073,15 +3165,23 @@ bool dna_adjust::PrepareAdjustmentReverse(const UINT32 currentBlock, bool MT_Rev } else { - // Restore back up copy of normals for reverse adjustment in + // Restore back up copy of normals for reverse adjustment in // single thread mode + assert(v_normalsR_.at(currentBlock).getbuffer() != nullptr && "PrepareAdjReverse: normalsR null before copy"); + assert(v_normalsR_.at(currentBlock).rows() > 0 && "PrepareAdjReverse: normalsR has 0 rows"); v_normals_.at(currentBlock) = v_normalsR_.at(currentBlock); + assert(v_normals_.at(currentBlock).getbuffer() != nullptr && "PrepareAdjReverse: normals null after copy"); } + assert(v_originalStations_.at(currentBlock).getbuffer() != nullptr && "PrepareAdjReverse: originalStations null"); // Reset coordinates to original values *estimatedStations = v_originalStations_.at(currentBlock); - + + assert(v_design_.at(currentBlock).rows() > 0 && "PrepareAdjReverse: design has 0 rows"); + assert(v_AtVinv_.at(currentBlock).rows() > 0 && "PrepareAdjReverse: AtVinv has 0 rows"); + assert(v_measMinusComp_.at(currentBlock).rows() > 0 && "PrepareAdjReverse: measMinusComp has 0 rows"); + AddConstraintStationstoNormalsReverse(currentBlock, MT_ReverseOrCombine); return true; @@ -3421,19 +3521,27 @@ void dna_adjust::AdjustPhasedReverseCombine() if (projectSettings_.g.verbose > 3) debug_file << "In isolation" << std::endl; + assert(v_design_.at(currentBlock).getbuffer() != nullptr && "design null before SolveTry (reverse combine)"); + assert(v_AtVinv_.at(currentBlock).getbuffer() != nullptr && "AtVinv null before SolveTry (reverse combine)"); + assert(v_normals_.at(currentBlock).getbuffer() != nullptr && "normals null before SolveTry (reverse combine)"); 
+ assert(v_measMinusComp_.at(currentBlock).getbuffer() != nullptr && "measMinusComp null before SolveTry (reverse combine)"); + // Backup normals prior to inversion for re-use in combination // adjustment... only if a combination is required BackupNormals(currentBlock, false); // Least Squares Solution - SolveTry(true, currentBlock); + if (projectSettings_.a.lm_enabled) + SolveLMTry(true, currentBlock, true); + else + SolveTry(true, currentBlock); if (projectSettings_.o._adj_stn_iteration) adj_file << " done." << std::endl; // Add corrections to estimates. // If --output-adj-iter-stat argument is supplied or currentBlock - // is the first block, then compute geographic coordinates, recompute + // is the first block, then compute geographic coordinates, recompute // meas-minus-computed vector, then print statistics. // Output of stats also prints largest correction. // If --output-adj-iter-stn argument is supplied, print station coords @@ -3441,16 +3549,16 @@ void dna_adjust::AdjustPhasedReverseCombine() // Debug and diagnose (if required) debug_BlockInformation(currentBlock, "(reverse, in isolation)"); - + // Carry the estimated junction station coordinates and variances to the next block and // combine. CarryReverseEstimates will return false if currentBlock is the first block - // or an isolated block. If estimates can be carried, CarryReverseEstimates updates + // or an isolated block. If estimates can be carried, CarryReverseEstimates updates // geographic coords (if req'd), recomputes meas-minus-comp vector and updates normals for // the next block. 
if (CarryReverseJunctions(currentBlock, currentBlock-1, false)) - { + { ////////////////////////////////////////////////////////////////////////// - // Combination Adjustment + // Combination Adjustment // // Perform combination on all blocks except the first and last as they will // be rigorous from reverse and forward adjustments respectively @@ -3458,19 +3566,22 @@ void dna_adjust::AdjustPhasedReverseCombine() // First, carry the junction station estimates and variances of the next block // (obtained during the forward pass). These were copied to v_junctionEstimatesFwd_ // and v_junctionVariances_ in CarryStnEstimatesandVariancesForward(..) during - // the forward pass. + // the forward pass. if (PrepareAdjustmentCombine(currentBlock, pseudomsrJSLCount, false)) { isCombining_ = true; if (projectSettings_.g.verbose > 3) debug_file << "Rigorous" << std::endl; - + if (projectSettings_.o._adj_stn_iteration) if (!v_blockMeta_.at(currentBlock)._blockFirst) adj_file << std::endl << std::left << "Adjusting block " << currentBlock+1 << " (reverse, rigorous)... "; // Least Squares Solution - SolveTry(true, currentBlock); + if (projectSettings_.a.lm_enabled) + SolveLMTry(true, currentBlock, true); + else + SolveTry(true, currentBlock); if (projectSettings_.o._adj_stn_iteration) adj_file << " done." 
<< std::endl; @@ -3505,7 +3616,7 @@ void dna_adjust::AdjustPhasedReverseCombine() } // for (UINT32 block=0; blockgetbuffer() != nullptr && "UpdateEstFinal: AtVinv null"); + assert(measMinusComp->getbuffer() != nullptr && "UpdateEstFinal: measMinusComp null"); + assert(aposterioriVariances->getbuffer() != nullptr && "UpdateEstFinal: aposterioriVariances null"); if (v_blockMeta_.at(currentBlock)._blockFirst) { @@ -3762,13 +3883,16 @@ bool dna_adjust::CarryReverseJunctions(const UINT32 currentBlock, const UINT32 n v_normals_.at(nextBlock) = v_normalsR_.at(nextBlock); } - // For staged adjustments, load blocks info for nextBlock + // For staged adjustments, prefetch and load blocks info for nextBlock if (projectSettings_.a.stage) + { + AdviseBlockWillNeed(nextBlock); DeserialiseBlockFromMappedFile(nextBlock); + } // Next, update estimates for next block to original estimates *estimatedStationsNext = v_originalStations_.at(nextBlock); - + // For staged adjustments, rebuild design and AtVinv if (projectSettings_.a.stage) RebuildNormals(nextBlock, __reverse__, false, false); @@ -3792,25 +3916,25 @@ bool dna_adjust::CarryReverseJunctions(const UINT32 currentBlock, const UINT32 n // go through each of the measurements in the binary measurements file and formulate partial derivatives -void dna_adjust::FillDesignNormalMeasurementsMatrices(bool buildnewMatrices, const UINT32& block, bool MT_ReverseOrCombine) +void dna_adjust::FillDesignNormalMeasurementsMatrices(bool buildnewMatrices, const UINT32& block, bool MT_ReverseOrCombine, bool skipNormals) { UINT32 design_row(0); - + it_vUINT32 _it_block_msr; - it_vmsr_t _it_msr; + it_vmsr_t _it_msr; for (_it_block_msr=v_CML_.at(block).begin(); _it_block_msr!=v_CML_.at(block).end(); ++_it_block_msr) { if (InitialiseandValidateMsrPointer(_it_block_msr, _it_msr)) continue; - // When a target direction is found, continue to next element. + // When a target direction is found, continue to next element. 
if (_it_msr->measType == 'D') if (_it_msr->vectorCount2 < 1) continue; // Build AtVinv, Normals and Meas minus Comp vectors - UpdateDesignNormalMeasMatrices(&_it_msr, design_row, buildnewMatrices, block, MT_ReverseOrCombine); + UpdateDesignNormalMeasMatrices(&_it_msr, design_row, buildnewMatrices, block, MT_ReverseOrCombine, skipNormals); } } @@ -3819,6 +3943,9 @@ void dna_adjust::FillDesignNormalMeasurementsMatrices(bool buildnewMatrices, con // pre-adjustment value, otherwise back up the current value. bool dna_adjust::InitialiseMeasurement(pit_vmsr_t _it_msr, bool buildnewMatrices) { + if (rebuildingDesign_) + return false; + if (buildnewMatrices) { // Was this measurement reduced from previous @@ -3839,7 +3966,7 @@ bool dna_adjust::InitialiseMeasurement(pit_vmsr_t _it_msr, bool buildnewMatrices } -void dna_adjust::UpdateDesignNormalMeasMatrices(pit_vmsr_t _it_msr, UINT32& design_row, bool buildnewMatrices, const UINT32& block, bool MT_ReverseOrCombine) +void dna_adjust::UpdateDesignNormalMeasMatrices(pit_vmsr_t _it_msr, UINT32& design_row, bool buildnewMatrices, const UINT32& block, bool MT_ReverseOrCombine, bool skipNormals) { std::stringstream ss; @@ -3855,101 +3982,101 @@ void dna_adjust::UpdateDesignNormalMeasMatrices(pit_vmsr_t _it_msr, UINT32& desi design = &v_designR_.at(block); AtVinv = &v_AtVinvR_.at(block); measMinusComp = &v_measMinusCompR_.at(block); - } + } switch ((*_it_msr)->measType) { case 'A': // Horizontal angle UpdateDesignNormalMeasMatrices_A(_it_msr, design_row, block, - measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices); + measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices, skipNormals); break; case 'B': // Geodetic azimuth case 'K': // Astronomic azimuth - // Note: UpdateDesignNormalMeasMatrices_BK reduces K measurements to the geodetic reference frame, + // Note: UpdateDesignNormalMeasMatrices_BK reduces K measurements to the geodetic reference frame, // after which 
UpdateDesignNormalMeasMatrices_BK treats K measurements as B measurements upon forming // design matrix elements. UpdateDesignNormalMeasMatrices_BK(_it_msr, design_row, block, - measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices); + measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices, skipNormals); break; case 'C': // Chord dist UpdateDesignNormalMeasMatrices_C(_it_msr, design_row, block, - measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices); + measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices, skipNormals); break; - case 'D': // Direction set + case 'D': // Direction set UpdateDesignNormalMeasMatrices_D(_it_msr, design_row, block, - measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices); + measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices, skipNormals); break; case 'E': // Ellipsoid arc UpdateDesignNormalMeasMatrices_E(_it_msr, design_row, block, - measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices); + measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices, skipNormals); break; case 'G': // GPS Baseline UpdateDesignNormalMeasMatrices_G(_it_msr, design_row, block, - measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices); + measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices, skipNormals); break; case 'H': // Orthometric height // Note: UpdateDesignNormalMeasMatrices_H reduces term1 to ellipsoid height, after which // UpdateDesignNormalMeasMatrices_HR is used to form design elements. 
UpdateDesignNormalMeasMatrices_H(_it_msr, design_row, block, - measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices); + measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices, skipNormals); break; case 'I': // Astronomic latitude // Note: UpdateDesignNormalMeasMatrices_I reduces term1 to geodetic latitude, after which // UpdateDesignNormalMeasMatrices_IP is used to form design elements. UpdateDesignNormalMeasMatrices_I(_it_msr, design_row, block, - measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices); + measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices, skipNormals); break; case 'J': // Astronomic longitude // Note: UpdateDesignNormalMeasMatrices_J reduces term1 to geodetic longitude, after which // UpdateDesignNormalMeasMatrices_JQ is used to form design elements. UpdateDesignNormalMeasMatrices_J(_it_msr, design_row, block, - measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices); - break; + measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices, skipNormals); + break; case 'L': // Level difference UpdateDesignNormalMeasMatrices_L(_it_msr, design_row, block, - measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices); - break; + measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices, skipNormals); + break; case 'M': // MSL arc UpdateDesignNormalMeasMatrices_M(_it_msr, design_row, block, - measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices); - break; + measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices, skipNormals); + break; case 'P': // Geodetic latitude // Note: UpdateDesignNormalMeasMatrices_P archives the raw measurement, after which // UpdateDesignNormalMeasMatrices_IP is used to form design elements. 
UpdateDesignNormalMeasMatrices_P(_it_msr, design_row, block, - measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices); - break; + measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices, skipNormals); + break; case 'Q': // Geodetic longitude // Note: UpdateDesignNormalMeasMatrices_Q archives the raw measurement, after which // UpdateDesignNormalMeasMatrices_JQ is used to form design elements. UpdateDesignNormalMeasMatrices_Q(_it_msr, design_row, block, - measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices); - break; + measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices, skipNormals); + break; case 'R': // Ellipsoidal height // Note: UpdateDesignNormalMeasMatrices_R archives the raw measurement, after which // UpdateDesignNormalMeasMatrices_HR is used to form design elements. UpdateDesignNormalMeasMatrices_R(_it_msr, design_row, block, - measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices); + measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices, skipNormals); break; case 'S': // Slope distance UpdateDesignNormalMeasMatrices_S(_it_msr, design_row, block, - measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices); + measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices, skipNormals); break; case 'V': // Zenith distance UpdateDesignNormalMeasMatrices_V(_it_msr, design_row, block, - measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices); - break; + measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices, skipNormals); + break; case 'X': // GPS Baseline cluster UpdateDesignNormalMeasMatrices_X(_it_msr, design_row, block, - measMinusComp, estimatedStations, design, AtVinv, buildnewMatrices); - break; + measMinusComp, estimatedStations, design, AtVinv, buildnewMatrices, skipNormals); + break; case 'Y': // GPS Point cluster 
UpdateDesignNormalMeasMatrices_Y(_it_msr, design_row, block, - measMinusComp, estimatedStations, design, AtVinv, buildnewMatrices); - break; + measMinusComp, estimatedStations, design, AtVinv, buildnewMatrices, skipNormals); + break; case 'Z': // Vertical angle UpdateDesignNormalMeasMatrices_Z(_it_msr, design_row, block, - measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices); + measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices, skipNormals); break; default: ss << "UpdateDesignNormalMeasMatrices(): Unknown measurement type - '" << @@ -4656,8 +4783,8 @@ void dna_adjust::AddMsrtoMeasMinusComp(pit_vmsr_t _it_msr, const UINT32& design_ void dna_adjust::UpdateDesignNormalMeasMatrices_A(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, - matrix_2d* measMinusComp, matrix_2d* estimatedStations, - matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices) + matrix_2d* measMinusComp, matrix_2d* estimatedStations, + matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices, bool skipNormals) { UINT32 stn1(GetBlkMatrixElemStn1(block, _it_msr)); UINT32 stn2(GetBlkMatrixElemStn2(block, _it_msr)); @@ -4683,11 +4810,11 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_A(pit_vmsr_t _it_msr, UINT32& de &direction12, &direction13, &local_12e, &local_12n, &local_13e, &local_13n)); - // Initialise measurement. No need to test if no further calculations are + // Initialise measurement. No need to test if no further calculations are // required (as in stage mode), as this is done later (below) InitialiseMeasurement(_it_msr, buildnewMatrices); - if (buildnewMatrices) + if (buildnewMatrices && !rebuildingDesign_) { // deflections available? 
if (fabs(stn1_it->verticalDef) > E4_SEC_DEFLECTION || fabs(stn1_it->meridianDef) > E4_SEC_DEFLECTION) @@ -4806,7 +4933,7 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_A(pit_vmsr_t _it_msr, UINT32& de UpdateAtVinv(_it_msr, stn1, stn2, stn3, design_row, design, AtVinv, buildnewMatrices); - if (buildnewMatrices) + if (buildnewMatrices && !skipNormals) // Add weighted measurement contributions to normal matrix UpdateNormals_A(stn1, stn2, stn3, design_row, normals, design, AtVinv); else @@ -4815,8 +4942,8 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_A(pit_vmsr_t _it_msr, UINT32& de void dna_adjust::UpdateDesignNormalMeasMatrices_BK(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, - matrix_2d* measMinusComp, matrix_2d* estimatedStations, - matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices) + matrix_2d* measMinusComp, matrix_2d* estimatedStations, + matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices, bool skipNormals) { UINT32 stn1(GetBlkMatrixElemStn1(block, _it_msr)); UINT32 stn2(GetBlkMatrixElemStn2(block, _it_msr)); @@ -4838,11 +4965,11 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_BK(pit_vmsr_t _it_msr, UINT32& d stn1_it->currentLongitude, &local_12e, &local_12n)); - // Initialise measurement. No need to test if no further calculations are + // Initialise measurement. No need to test if no further calculations are // required (as in stage mode), as this is done later (below) InitialiseMeasurement(_it_msr, buildnewMatrices); - - if (buildnewMatrices) + + if (buildnewMatrices && !rebuildingDesign_) { // deflections available? 
if ((*_it_msr)->measType == 'K' && // Astro @@ -4910,19 +5037,19 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_BK(pit_vmsr_t _it_msr, UINT32& d // Update AtVinv based on new design matrix elements UpdateAtVinv(_it_msr, stn1, stn2, 0, design_row, design, AtVinv, buildnewMatrices); - if (buildnewMatrices) + if (buildnewMatrices && !skipNormals) // Add weighted measurement contributions to normal matrix UpdateNormals_BCEKLMSVZ(stn1, stn2, design_row, normals, design, AtVinv); else design_row++; } - + void dna_adjust::UpdateDesignNormalMeasMatrices_C(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, - matrix_2d* measMinusComp, matrix_2d* estimatedStations, - matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices) + matrix_2d* measMinusComp, matrix_2d* estimatedStations, + matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices, bool skipNormals) { - // Initialise measurement and test if no further calculations are + // Initialise measurement and test if no further calculations are // required (as in stage mode) if (InitialiseMeasurement(_it_msr, buildnewMatrices)) return; @@ -4933,13 +5060,13 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_C(pit_vmsr_t _it_msr, UINT32& de // Now call UpdateDesignNormalMeasMatrices_CEM UpdateDesignNormalMeasMatrices_CEM(_it_msr, design_row, block, - measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices); + measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices, skipNormals); } void dna_adjust::UpdateDesignNormalMeasMatrices_CEM(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, - matrix_2d* measMinusComp, matrix_2d* estimatedStations, - matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices) + matrix_2d* measMinusComp, matrix_2d* estimatedStations, + matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices, bool skipNormals) { UINT32 stn1(GetBlkMatrixElemStn1(block, 
_it_msr)); UINT32 stn2(GetBlkMatrixElemStn2(block, _it_msr)); @@ -4975,17 +5102,17 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_CEM(pit_vmsr_t _it_msr, UINT32& // Update AtVinv based on new design matrix elements UpdateAtVinv(_it_msr, stn1, stn2, 0, design_row, design, AtVinv, buildnewMatrices); - if (buildnewMatrices) + if (buildnewMatrices && !skipNormals) // Add weighted measurement contributions to normal matrix UpdateNormals_BCEKLMSVZ(stn1, stn2, design_row, normals, design, AtVinv); else design_row++; } - + void dna_adjust::UpdateDesignNormalMeasMatrices_D(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, - matrix_2d* measMinusComp, matrix_2d* estimatedStations, - matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices) + matrix_2d* measMinusComp, matrix_2d* estimatedStations, + matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices, bool skipNormals) { it_vmsr_t _it_msr_first(*_it_msr); UINT32 design_row_begin(design_row); @@ -5034,7 +5161,7 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_D(pit_vmsr_t _it_msr, UINT32& de it_angle->station3 = (*_it_msr)->station2; - if (buildnewMatrices) + if (buildnewMatrices && !rebuildingDesign_) { // Was this measurement reduced from previous // adjustment, which was serialised to disk? 
@@ -5067,7 +5194,7 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_D(pit_vmsr_t _it_msr, UINT32& de UpdateDesignNormalMeasMatrices_A(&it_angle, design_row, block, measMinusComp, estimatedStations, 0, design, AtVinv, buildnewMatrices); - if (buildnewMatrices) + if (buildnewMatrices && !rebuildingDesign_) { // Update derived angle, corrected for deflection of vertical (*_it_msr)->scale1 = it_angle->term1; @@ -5138,14 +5265,14 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_D(pit_vmsr_t _it_msr, UINT32& de it_angle++; } - if (buildnewMatrices) + if (buildnewMatrices && !skipNormals) // Add weighted measurement contributions to normal matrix UpdateNormals_D(block, _it_msr_first, design_row_begin, normals, design, AtVinv); } void dna_adjust::UpdateDesignNormalMeasMatrices_E(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, - matrix_2d* measMinusComp, matrix_2d* estimatedStations, - matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices) + matrix_2d* measMinusComp, matrix_2d* estimatedStations, + matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices, bool skipNormals) { // Initialise measurement and test if no further calculations are // required (as in stage mode) @@ -5181,7 +5308,7 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_E(pit_vmsr_t _it_msr, UINT32& de // Now that the ellipsoid arc has been reduced to a chord, call UpdateDesignNormalMeasMatrices_CEM UpdateDesignNormalMeasMatrices_CEM(_it_msr, design_row, block, - measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices); + measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices, skipNormals); } void dna_adjust::UpdateDesignMeasMatrices_GX(pit_vmsr_t _it_msr, UINT32& design_row, @@ -5255,8 +5382,8 @@ void dna_adjust::UpdateDesignMeasMatrices_GX(pit_vmsr_t _it_msr, UINT32& design_ } void dna_adjust::UpdateDesignNormalMeasMatrices_G(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, - matrix_2d* 
measMinusComp, matrix_2d* estimatedStations, - matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices) + matrix_2d* measMinusComp, matrix_2d* estimatedStations, + matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices, bool skipNormals) { it_vmsr_t _it_msr_first(*_it_msr); UINT32 stn1(GetBlkMatrixElemStn1(block, _it_msr)); @@ -5278,7 +5405,7 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_G(pit_vmsr_t _it_msr, UINT32& de // For first time run or staged adjustment mode - if (buildnewMatrices || projectSettings_.a.stage) + if (buildnewMatrices || projectSettings_.a.stage || skipNormals) { // Load GPS Variance matrix. For the first run, scaling is applied // and the variances are written to the binary measurements list. @@ -5287,23 +5414,23 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_G(pit_vmsr_t _it_msr, UINT32& de if (buildnewMatrices && projectSettings_.a.stage) return; - + // Build At * V-1 AtVinv->replace(stn1, design_row_begin, var_cart * -1); AtVinv->replace(stn2, design_row_begin, var_cart); - if (buildnewMatrices && !projectSettings_.a.stage) + if (buildnewMatrices && !projectSettings_.a.stage && !skipNormals) // Add weighted measurement contributions to normal matrix UpdateNormals_G(stn1, stn2, design_row_begin, normals, AtVinv); } - + design_row++; } - + void dna_adjust::UpdateDesignNormalMeasMatrices_M(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, - matrix_2d* measMinusComp, matrix_2d* estimatedStations, - matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices) + matrix_2d* measMinusComp, matrix_2d* estimatedStations, + matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices, bool skipNormals) { // Initialise measurement and test if no further calculations are // required (as in stage mode) @@ -5328,22 +5455,22 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_M(pit_vmsr_t _it_msr, UINT32& de // Now that the MSL arc has been reduced 
to a chord, call UpdateDesignNormalMeasMatrices_CEM UpdateDesignNormalMeasMatrices_CEM(_it_msr, design_row, block, - measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices); + measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices, skipNormals); } // Like zenith distances and vertical angles, the relationship between slope distances and the // coordinates of p1 and p2 requires a little extra work to take into consideration instrument // and target height. For the measurements-minus-computed vector, the "computed" distance -// is the true distance between the instrument and target, and so must take into consideration +// is the true distance between the instrument and target, and so must take into consideration // instrument and target heights. However, the dX, dY, dZ components for the partial // derivatives represent the true geometric difference between the two stations (not // instrument and target). void dna_adjust::UpdateDesignNormalMeasMatrices_S(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, - matrix_2d* measMinusComp, matrix_2d* estimatedStations, - matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices) + matrix_2d* measMinusComp, matrix_2d* estimatedStations, + matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices, bool skipNormals) { // preAdjMeas is used to store original measured MSL arc distance - if (buildnewMatrices) + if (buildnewMatrices && !rebuildingDesign_) { // This is the first time adjust has run, so // back up the raw measurement @@ -5353,7 +5480,7 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_S(pit_vmsr_t _it_msr, UINT32& de return; } - UINT32 stn1(GetBlkMatrixElemStn1(block, _it_msr)); + UINT32 stn1(GetBlkMatrixElemStn1(block, _it_msr)); UINT32 stn2(GetBlkMatrixElemStn2(block, _it_msr)); it_vstn_t_const stn1_it(bstBinaryRecords_.begin() + (*_it_msr)->station1); @@ -5361,7 +5488,7 @@ void 
dna_adjust::UpdateDesignNormalMeasMatrices_S(pit_vmsr_t _it_msr, UINT32& de // compute dX, dY, dZ for instrument height (ih) and target height (th) double dXih, dYih, dZih, dXth, dYth, dZth; CartesianElementsFromInstrumentHeight((*_it_msr)->term3, // instrument height - &dXih, &dYih, &dZih, + &dXih, &dYih, &dZih, stn1_it->currentLatitude, stn1_it->currentLongitude); CartesianElementsFromInstrumentHeight((*_it_msr)->term4, // target height @@ -5389,7 +5516,7 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_S(pit_vmsr_t _it_msr, UINT32& de // Update AtVinv based on new design matrix elements UpdateAtVinv(_it_msr, stn1, stn2, 0, design_row, design, AtVinv, buildnewMatrices); - if (buildnewMatrices) + if (buildnewMatrices && !skipNormals) // Add weighted measurement contributions to normal matrix UpdateNormals_BCEKLMSVZ(stn1, stn2, design_row, normals, design, AtVinv); else @@ -5406,10 +5533,10 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_S(pit_vmsr_t _it_msr, UINT32& de // partial derivatives represent the true geometric difference between the two stations (not // instrument and target). void dna_adjust::UpdateDesignNormalMeasMatrices_V(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, - matrix_2d* measMinusComp, matrix_2d* estimatedStations, - matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices) + matrix_2d* measMinusComp, matrix_2d* estimatedStations, + matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices, bool skipNormals) { - UINT32 stn1(GetBlkMatrixElemStn1(block, _it_msr)); + UINT32 stn1(GetBlkMatrixElemStn1(block, _it_msr)); UINT32 stn2(GetBlkMatrixElemStn2(block, _it_msr)); it_vstn_t_const stn1_it(bstBinaryRecords_.begin() + (*_it_msr)->station1); @@ -5417,11 +5544,11 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_V(pit_vmsr_t _it_msr, UINT32& de double local_12e, local_12n, local_12up; - // Initialise measurement. 
No need to test if no further calculations are + // Initialise measurement. No need to test if no further calculations are // required (as in stage mode), as this is done later (below) InitialiseMeasurement(_it_msr, buildnewMatrices); - if (buildnewMatrices) + if (buildnewMatrices && !rebuildingDesign_) { // deflections available? if (fabs(stn1_it->verticalDef) > E4_SEC_DEFLECTION || fabs(stn1_it->meridianDef) > E4_SEC_DEFLECTION) @@ -5497,7 +5624,7 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_V(pit_vmsr_t _it_msr, UINT32& de // Update AtVinv based on new design matrix elements UpdateAtVinv(_it_msr, stn1, stn2, 0, design_row, design, AtVinv, buildnewMatrices); - if (buildnewMatrices) + if (buildnewMatrices && !skipNormals) // Add weighted measurement contributions to normal matrix UpdateNormals_BCEKLMSVZ(stn1, stn2, design_row, normals, design, AtVinv); else @@ -5510,15 +5637,15 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_V(pit_vmsr_t _it_msr, UINT32& de // Like zenith distances, the relationship between slope distances and the // coordinates of p1 and p2 requires a little extra work to take into consideration instrument // and target height. For the measurements-minus-computed vector, the "computed" distance -// is the true distance between the instrument and target, and so must take into consideration -// instrument and target heights. However, the dX, dY, dZ components for the partial -// derivatives represent the true geometric difference between the two stations (not +// is the true distance between the instrument and target, and so must take into consideration +// instrument and target heights. However, the dX, dY, dZ components for the partial +// derivatives represent the true geometric difference between the two stations (not // instrument and target). 
void dna_adjust::UpdateDesignNormalMeasMatrices_Z(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, - matrix_2d* measMinusComp, matrix_2d* estimatedStations, - matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices) + matrix_2d* measMinusComp, matrix_2d* estimatedStations, + matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices, bool skipNormals) { - UINT32 stn1(GetBlkMatrixElemStn1(block, _it_msr)); + UINT32 stn1(GetBlkMatrixElemStn1(block, _it_msr)); UINT32 stn2(GetBlkMatrixElemStn2(block, _it_msr)); it_vstn_t_const stn1_it(bstBinaryRecords_.begin() + (*_it_msr)->station1); @@ -5526,11 +5653,11 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_Z(pit_vmsr_t _it_msr, UINT32& de double local_12e, local_12n, local_12up; - // Initialise measurement. No need to test if no further calculations are + // Initialise measurement. No need to test if no further calculations are // required (as in stage mode), as this is done later (below) InitialiseMeasurement(_it_msr, buildnewMatrices); - if (buildnewMatrices) + if (buildnewMatrices && !rebuildingDesign_) { // deflections available? if (fabs(stn1_it->verticalDef) > E4_SEC_DEFLECTION || fabs(stn1_it->meridianDef) > E4_SEC_DEFLECTION) @@ -5606,21 +5733,21 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_Z(pit_vmsr_t _it_msr, UINT32& de // Update AtVinv based on new design matrix elements UpdateAtVinv(_it_msr, stn1, stn2, 0, design_row, design, AtVinv, buildnewMatrices); - if (buildnewMatrices) + if (buildnewMatrices && !skipNormals) // Add weighted measurement contributions to normal matrix UpdateNormals_BCEKLMSVZ(stn1, stn2, design_row, normals, design, AtVinv); else design_row++; } - + // Orthometric height difference. -// This function requires geoid-ellipsoid separation in order to reduce -// to ellipsoidal height difference counterpart. +// This function requires geoid-ellipsoid separation in order to reduce +// to ellipsoidal height difference counterpart. 
// Hence, run geoid with -f, -s and -n options void dna_adjust::UpdateDesignNormalMeasMatrices_L(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, - matrix_2d* measMinusComp, matrix_2d* estimatedStations, - matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices) + matrix_2d* measMinusComp, matrix_2d* estimatedStations, + matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices, bool skipNormals) { UINT32 stn1(GetBlkMatrixElemStn1(block, _it_msr)); UINT32 stn2(GetBlkMatrixElemStn2(block, _it_msr)); @@ -5645,11 +5772,11 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_L(pit_vmsr_t _it_msr, UINT32& de &h1, &h2, &nu1, &nu2, &Zn1, &Zn2, datum_.GetEllipsoidRef())); - // Initialise measurement. No need to test if no further calculations are + // Initialise measurement. No need to test if no further calculations are // required (as in stage mode), as this is done later (below) InitialiseMeasurement(_it_msr, buildnewMatrices); - if (buildnewMatrices) + if (buildnewMatrices && !rebuildingDesign_) { // N value available? 
if (fabs(stn1_it->geoidSep) > PRECISION_1E4 || @@ -5680,7 +5807,7 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_L(pit_vmsr_t _it_msr, UINT32& de // Update AtVinv based on new design matrix elements UpdateAtVinv(_it_msr, stn1, stn2, 0, design_row, design, AtVinv, buildnewMatrices); - if (buildnewMatrices) + if (buildnewMatrices && !skipNormals) // Add weighted measurement contributions to normal matrix UpdateNormals_BCEKLMSVZ(stn1, stn2, design_row, normals, design, AtVinv); else @@ -5688,17 +5815,17 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_L(pit_vmsr_t _it_msr, UINT32& de } void dna_adjust::UpdateDesignNormalMeasMatrices_I(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, - matrix_2d* measMinusComp, matrix_2d* estimatedStations, - matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices) + matrix_2d* measMinusComp, matrix_2d* estimatedStations, + matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices, bool skipNormals) { - // Initialise measurement. No need to test if no further calculations are + // Initialise measurement. No need to test if no further calculations are // required (as in stage mode), as this is done later (below) InitialiseMeasurement(_it_msr, buildnewMatrices); - if (buildnewMatrices) + if (buildnewMatrices && !rebuildingDesign_) { it_vstn_t_const stn1_it(bstBinaryRecords_.begin() + (*_it_msr)->station1); - + // deflections available? 
if (fabs(stn1_it->meridianDef) > E4_SEC_DEFLECTION) { @@ -5713,22 +5840,22 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_I(pit_vmsr_t _it_msr, UINT32& de } UpdateDesignNormalMeasMatrices_IP(_it_msr, design_row, block, - measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices); + measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices, skipNormals); } void dna_adjust::UpdateDesignNormalMeasMatrices_J(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, - matrix_2d* measMinusComp, matrix_2d* estimatedStations, - matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices) + matrix_2d* measMinusComp, matrix_2d* estimatedStations, + matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices, bool skipNormals) { - // Initialise measurement. No need to test if no further calculations are + // Initialise measurement. No need to test if no further calculations are // required (as in stage mode), as this is done later (below) InitialiseMeasurement(_it_msr, buildnewMatrices); - if (buildnewMatrices) + if (buildnewMatrices && !rebuildingDesign_) { it_vstn_t_const stn1_it(bstBinaryRecords_.begin() + (*_it_msr)->station1); - + // deflections available? 
if (fabs(stn1_it->verticalDef) > E4_SEC_DEFLECTION) { @@ -5744,27 +5871,27 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_J(pit_vmsr_t _it_msr, UINT32& de } UpdateDesignNormalMeasMatrices_JQ(_it_msr, design_row, block, - measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices); + measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices, skipNormals); } void dna_adjust::UpdateDesignNormalMeasMatrices_P(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, - matrix_2d* measMinusComp, matrix_2d* estimatedStations, - matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices) + matrix_2d* measMinusComp, matrix_2d* estimatedStations, + matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices, bool skipNormals) { - // Initialise measurement and test if no further calculations are + // Initialise measurement and test if no further calculations are // required (as in stage mode) if (InitialiseMeasurement(_it_msr, buildnewMatrices)) return; UpdateDesignNormalMeasMatrices_IP(_it_msr, design_row, block, - measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices); + measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices, skipNormals); } - + void dna_adjust::UpdateDesignNormalMeasMatrices_IP(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, - matrix_2d* measMinusComp, matrix_2d* estimatedStations, - matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices) + matrix_2d* measMinusComp, matrix_2d* estimatedStations, + matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices, bool skipNormals) { UINT32 stn1(GetBlkMatrixElemStn1(block, _it_msr)); @@ -5810,31 +5937,31 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_IP(pit_vmsr_t _it_msr, UINT32& d // Update AtVinv based on new design matrix elements UpdateAtVinv(_it_msr, stn1, 0, 0, design_row, design, AtVinv, buildnewMatrices); - if 
(buildnewMatrices) + if (buildnewMatrices && !skipNormals) // Add weighted measurement contributions to normal matrix UpdateNormals_HIJPQR(stn1, design_row, normals, design, AtVinv); else design_row++; } - + void dna_adjust::UpdateDesignNormalMeasMatrices_Q(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, - matrix_2d* measMinusComp, matrix_2d* estimatedStations, - matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices) + matrix_2d* measMinusComp, matrix_2d* estimatedStations, + matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices, bool skipNormals) { - // Initialise measurement and test if no further calculations are + // Initialise measurement and test if no further calculations are // required (as in stage mode) if (InitialiseMeasurement(_it_msr, buildnewMatrices)) return; UpdateDesignNormalMeasMatrices_JQ(_it_msr, design_row, block, - measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices); + measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices, skipNormals); } - + void dna_adjust::UpdateDesignNormalMeasMatrices_JQ(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, - matrix_2d* measMinusComp, matrix_2d* estimatedStations, - matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices) + matrix_2d* measMinusComp, matrix_2d* estimatedStations, + matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices, bool skipNormals) { UINT32 stn1(GetBlkMatrixElemStn1(block, _it_msr)); @@ -5862,26 +5989,26 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_JQ(pit_vmsr_t _it_msr, UINT32& d // Update AtVinv based on new design matrix elements UpdateAtVinv(_it_msr, stn1, 0, 0, design_row, design, AtVinv, buildnewMatrices); - if (buildnewMatrices) + if (buildnewMatrices && !skipNormals) // Add weighted measurement contributions to normal matrix UpdateNormals_HIJPQR(stn1, design_row, normals, design, AtVinv); else 
design_row++; } - + void dna_adjust::UpdateDesignNormalMeasMatrices_H(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, - matrix_2d* measMinusComp, matrix_2d* estimatedStations, - matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices) + matrix_2d* measMinusComp, matrix_2d* estimatedStations, + matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices, bool skipNormals) { - // Initialise measurement. No need to test if no further calculations are + // Initialise measurement. No need to test if no further calculations are // required (as in stage mode), as this is done later (below) InitialiseMeasurement(_it_msr, buildnewMatrices); - if (buildnewMatrices) + if (buildnewMatrices && !rebuildingDesign_) { it_vstn_t_const stn1_it(bstBinaryRecords_.begin() + (*_it_msr)->station1); - + // N value available? if (fabs(stn1_it->geoidSep) > PRECISION_1E4) { @@ -5895,13 +6022,13 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_H(pit_vmsr_t _it_msr, UINT32& de } UpdateDesignNormalMeasMatrices_HR(_it_msr, design_row, block, - measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices); + measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices, skipNormals); } void dna_adjust::UpdateDesignNormalMeasMatrices_HR(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, - matrix_2d* measMinusComp, matrix_2d* estimatedStations, - matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices) + matrix_2d* measMinusComp, matrix_2d* estimatedStations, + matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices, bool skipNormals) { UINT32 stn1(GetBlkMatrixElemStn1(block, _it_msr)); @@ -5935,31 +6062,31 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_HR(pit_vmsr_t _it_msr, UINT32& d // Update AtVinv based on new design matrix elements UpdateAtVinv(_it_msr, stn1, 0, 0, design_row, design, AtVinv, buildnewMatrices); - if (buildnewMatrices) + if 
(buildnewMatrices && !skipNormals) // Add weighted measurement contributions to normal matrix UpdateNormals_HIJPQR(stn1, design_row, normals, design, AtVinv); else design_row++; } - + void dna_adjust::UpdateDesignNormalMeasMatrices_R(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, - matrix_2d* measMinusComp, matrix_2d* estimatedStations, - matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices) + matrix_2d* measMinusComp, matrix_2d* estimatedStations, + matrix_2d* normals, matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices, bool skipNormals) { - // Initialise measurement and test if no further calculations are + // Initialise measurement and test if no further calculations are // required (as in stage mode) if (InitialiseMeasurement(_it_msr, buildnewMatrices)) return; UpdateDesignNormalMeasMatrices_HR(_it_msr, design_row, block, - measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices); + measMinusComp, estimatedStations, normals, design, AtVinv, buildnewMatrices, skipNormals); } - + void dna_adjust::UpdateDesignNormalMeasMatrices_X(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, - matrix_2d* measMinusComp, matrix_2d* estimatedStations, - matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices) + matrix_2d* measMinusComp, matrix_2d* estimatedStations, + matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices, bool skipNormals) { it_vmsr_t _it_msr_first(*_it_msr); @@ -5996,7 +6123,7 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_X(pit_vmsr_t _it_msr, UINT32& de matrix_2d var_cart(baseline_count * 3, baseline_count * 3); - if (buildnewMatrices || projectSettings_.a.stage) + if (buildnewMatrices || projectSettings_.a.stage || skipNormals) { // Load apriori variance matrix, and assign to binary measurement // If required, apply scalars @@ -6006,10 +6133,10 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_X(pit_vmsr_t _it_msr, UINT32& de return; } - // If this method is called 
via PrepareAdjustment() and the adjustment + // If this method is called via PrepareAdjustment() and the adjustment // mode is staged, then don't update the AtVinv matrix. This will be // done during an adjustment via AdjustPhasedForward(). - if (!buildnewMatrices && !projectSettings_.a.stage) + if (!buildnewMatrices && !projectSettings_.a.stage && !skipNormals) return; UINT32 cluster_cov; @@ -6068,11 +6195,11 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_X(pit_vmsr_t _it_msr, UINT32& de _it_msr_temp += 3; } - if (projectSettings_.a.stage || !buildnewMatrices) + if (projectSettings_.a.stage || !buildnewMatrices || skipNormals) return; - + _it_msr_temp = _it_msr_first; - + matrix_2d tmp0(3, 3); // Build At * V-1 * A variances @@ -6151,8 +6278,8 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_X(pit_vmsr_t _it_msr, UINT32& de void dna_adjust::UpdateDesignNormalMeasMatrices_Y(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, - matrix_2d* measMinusComp, matrix_2d* estimatedStations, - matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices) + matrix_2d* measMinusComp, matrix_2d* estimatedStations, + matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices, bool skipNormals) { it_vmsr_t _it_msr_first(*_it_msr); it_vmsr_t tmp_msr; @@ -6365,23 +6492,23 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_Y(pit_vmsr_t _it_msr, UINT32& de matrix_2d var_cart(point_count * 3, point_count * 3); - if (buildnewMatrices || projectSettings_.a.stage) + if (buildnewMatrices || projectSettings_.a.stage || skipNormals) { // Load apriori variance matrix, and assign to binary measurement - // If required, propagate to cartesian reference frame and apply + // If required, propagate to cartesian reference frame and apply // scalars LoadVarianceMatrix_Y(_it_msr_first, &var_cart, coordType); - + // If preparing for a stage adjustment, return // Normals will be built for each block as needed if (buildnewMatrices && projectSettings_.a.stage) return; } - - // If this 
method is called via PrepareAdjustment() and the adjustment + + // If this method is called via PrepareAdjustment() and the adjustment // mode is staged, then don't update the AtVinv matrix. This will be // done during an adjustment via AdjustPhasedForward(). - if (!buildnewMatrices && !projectSettings_.a.stage) + if (!buildnewMatrices && !projectSettings_.a.stage && !skipNormals) return; UINT32 cluster_cov; @@ -6424,9 +6551,9 @@ void dna_adjust::UpdateDesignNormalMeasMatrices_Y(pit_vmsr_t _it_msr, UINT32& de design_row_begin += 3; } - if (projectSettings_.a.stage || !buildnewMatrices) + if (projectSettings_.a.stage || !buildnewMatrices || skipNormals) return; - + _it_msr_temp = _it_msr_first; // Add to At * V-1 * A @@ -6489,6 +6616,13 @@ void dna_adjust::SolveTry(bool COMPUTE_INVERSE, const UINT32& block) void dna_adjust::Solve(bool COMPUTE_INVERSE, const UINT32& block) { + assert(v_design_.at(block).rows() > 0 && "Solve: design has 0 rows"); + assert(v_design_.at(block).columns() > 0 && "Solve: design has 0 columns"); + assert(v_AtVinv_.at(block).rows() > 0 && "Solve: AtVinv has 0 rows"); + assert(v_AtVinv_.at(block).columns() > 0 && "Solve: AtVinv has 0 columns"); + assert(v_normals_.at(block).rows() > 0 && "Solve: normals has 0 rows"); + assert(v_measMinusComp_.at(block).rows() > 0 && "Solve: measMinusComp has 0 rows"); + // debug matrices if required debug_SolutionInformation(block); @@ -6508,33 +6642,24 @@ void dna_adjust::Solve(bool COMPUTE_INVERSE, const UINT32& block) // the normal matrix before inversion and subsequently reversing the effect. // - // 1. Create scalar matrix - matrix_2d *S = nullptr, *SN = nullptr; + // 1. 
Scale normals to unity using diagonal scaling + UINT32 n = v_normals_.at(block).rows(); + std::vector s_diag; if (projectSettings_.a.scale_normals_to_unity) { - S = new matrix_2d(v_normals_.at(block).rows(), v_normals_.at(block).rows()); - SN = new matrix_2d(v_normals_.at(block).rows(), v_normals_.at(block).rows()); - for (UINT32 i(0); iput(i, i, sqrt(v_normals_.at(block).get(i, i))); - // 2. Scale Normals to reduce the diagonal elements of Normals to unity - //SN->multiply(*S, v_normals_.at(block)); - SN->multiply(*S, "N", v_normals_.at(block), "N"); - - //v_normals_.at(block).multiply(*SN, *S); - v_normals_.at(block).multiply(*SN, "N", *S, "N"); + s_diag.resize(n); + for (UINT32 i(0); imultiply(*S, v_normals_.at(block)); - SN->multiply(*S, "N", v_normals_.at(block), "N"); - - //v_normals_.at(block).multiply(*SN, *S); - v_normals_.at(block).multiply(*SN, "N", *S, "N"); - - delete S; - delete SN; + v_normals_.at(block).scale_symmetric_diagonal(s_diag.data()); } ////////////////// } @@ -6574,7 +6692,10 @@ void dna_adjust::Solve(bool COMPUTE_INVERSE, const UINT32& block) // Solve corrections from normal equations v_corrections_.at(block).redim(v_design_.at(block).columns(), 1); - v_corrections_.at(block).multiply(v_normals_.at(block), "N", At_Vinv_m, "N"); + if (v_normals_.at(block).is_symmetric()) + v_corrections_.at(block).multiply_sym(v_normals_.at(block), At_Vinv_m); + else + v_corrections_.at(block).multiply(v_normals_.at(block), "N", At_Vinv_m, "N"); if (projectSettings_.g.verbose > 0) { @@ -6627,60 +6748,663 @@ void dna_adjust::Solve(bool COMPUTE_INVERSE, const UINT32& block) } -void dna_adjust::DeSerialiseAdjustedVarianceMatrices() +// ============================================================================ +// Levenberg-Marquardt / Trust-Region methods +// ============================================================================ + +void dna_adjust::InitialiseLMState() { - // No need to facilitate serialising if network adjustment is in stage, - // as 
this will already be taken care of - if (projectSettings_.a.stage) - return; + lm_lambda_ = projectSettings_.a.lm_lambda_init; + chiSquaredPrev_ = 0.0; - // Create file streams and memory map regions - OpenStageFileStreams(2, sf_rigorous_vars, sf_prec_adj_msrs); - ReserveBlockMapRegions(2, sf_rigorous_vars, sf_prec_adj_msrs); + // Resize backup/gradient/station-backup vectors to match block count + v_normalsBackup_.resize(blockCount_); + v_gradient_.resize(blockCount_); + v_estimatedStationsBackup_.resize(blockCount_); +} - for (UINT32 block=0; block 0 && "SolveLM: design has 0 rows"); + assert(v_design_.at(block).columns() > 0 && "SolveLM: design has 0 columns"); + assert(v_AtVinv_.at(block).rows() > 0 && "SolveLM: AtVinv has 0 rows"); + assert(v_AtVinv_.at(block).columns() > 0 && "SolveLM: AtVinv has 0 columns"); + assert(v_normals_.at(block).rows() > 0 && "SolveLM: normals has 0 rows"); + assert(v_measMinusComp_.at(block).rows() > 0 && "SolveLM: measMinusComp has 0 rows"); + + debug_SolutionInformation(block); + + // 1. Compute gradient: g = At * V^-1 * (meas - comp) + UINT32 n = v_normals_.at(block).rows(); + v_gradient_.at(block).redim(n, 1); + v_gradient_.at(block).multiply(v_AtVinv_.at(block), "N", v_measMinusComp_.at(block), "N"); + + // Diagonal scaling factors (function-scope so both factor and solve steps can use them) + std::vector s_diag; + bool useScaling = projectSettings_.a.scale_normals_to_unity; + + if (COMPUTE_FACTOR) { - // Set Region offsets - SetRegionOffsets(block, 2, sf_rigorous_vars, sf_prec_adj_msrs); + // 2a. Backup pristine normals (before augmentation) + v_normalsBackup_.at(block) = v_normals_.at(block); - // Prepare mapping regions - PrepareMemoryMapRegions(block, 2, sf_rigorous_vars, sf_prec_adj_msrs); - } + // 2b. 
Apply optional diagonal scaling + if (useScaling) + { + s_diag.resize(n); + for (UINT32 i(0); i < n; ++i) + s_diag[i] = 1.0 / sqrt(v_normals_.at(block).get(i, i)); + v_normals_.at(block).scale_symmetric_diagonal(s_diag.data()); + } + + // 2c. Augment diagonal with identity damping: N_ii += lambda + // Identity damping (N + lambda*I) is used instead of Marquardt scaling + // (N + lambda*diag(N)) because the latter amplifies damping by the diagonal + // magnitude, which can be excessively aggressive for GPS networks where + // diagonal elements (~1e6) dwarf the smallest eigenvalues (~1e3). + // When scaling is active, we add lambda/N_ii to the scaled diagonal so + // that the effective damping in the original (unscaled) space is lambda*I. + for (UINT32 i(0); i < n; ++i) + { + double diag_val = useScaling ? (1.0 / v_normalsBackup_.at(block).get(i, i)) : 1.0; + v_normals_.at(block).elementadd(i, i, lm_lambda_ * diag_val); + } - - try { - // Set memory mapped file regions - // NOTE - previously created files must exist and match the - // dimensions of the current matrix sizes - SetMapRegions(2, sf_rigorous_vars, sf_prec_adj_msrs); - - } - catch (boost::interprocess::interprocess_exception& e){ - std::stringstream ss; - ss << "DeSerialiseAdjustedVarianceMatrices() terminated while creating memory map" << std::endl; - ss << " regions from mtx file. Details:\n " << e.what() << std::endl; - adj_file << std::endl << "- Error: " << ss.str() << std::endl; - SignalExceptionAdjustment(ss.str(), 0); + // 2d. Cholesky factor (dpotrf only, no inverse) + // No clearupper() needed — dpotrf(uplo='L') reads only the lower triangle, + // and packed storage has no upper triangle to clear. + v_normals_.at(block).cholesky_factor(false); + + // 2f. 
Scale gradient into scaled space for solve: g_scaled = S * g + if (useScaling) + { + for (UINT32 i(0); i < n; ++i) + *v_gradient_.at(block).getbuffer(i, 0) *= s_diag[i]; + } } - // Now load up matrix from .mtx file - for (UINT32 block=0; block g = g_scaled / S = g_scaled * sqrt(N_ii) + for (UINT32 i(0); i < n; ++i) + *v_gradient_.at(block).getbuffer(i, 0) /= s_diag[i]; } - // Close rigorous variances stream - f_rigorousVariances_.close(); - f_precAdjMsrs_.close(); -} - + // At this point, v_gradient_ and v_corrections_ are both in original (unscaled) space, + // so ComputePredictedReduction() will compute g^T * delta + lambda * delta^T * D * delta + // consistently. -void dna_adjust::SerialiseAdjustedVarianceMatrices() -{ - // No need to facilitate serialising if network adjustment is in stage, - // as this will already be taken care of + // 5. Compute inverse of undamped normals (for phased junction carry-forward). + // In phased mode, after solving with LM damping the code needs N^-1 in + // v_normals_ for the junction station variance carry-forward. Restore + // the pristine (undamped) normals from backup, then invert. 
+ if (COMPUTE_INVERSE) + { + v_normals_.at(block) = v_normalsBackup_.at(block); + if (useScaling) + { + // Recompute scaling from restored normals + s_diag.resize(n); + for (UINT32 i(0); i < n; ++i) + s_diag[i] = 1.0 / sqrt(v_normals_.at(block).get(i, i)); + v_normals_.at(block).scale_symmetric_diagonal(s_diag.data()); + } + + FormInverseVarianceMatrix(&v_normals_.at(block), false, false); + + if (useScaling) + v_normals_.at(block).scale_symmetric_diagonal(s_diag.data()); + } + + if (projectSettings_.g.verbose > 0) + { + if (projectSettings_.a.multi_thread) + dbg_file_mutex.lock(); + + debug_file << "Block " << block + 1 << " (LM, lambda=" << lm_lambda_ << ")" << std::endl; + debug_file << "Corrections " << std::fixed << std::setprecision(16) << v_corrections_.at(block) << std::endl; + debug_file.flush(); + + if (projectSettings_.a.multi_thread) + dbg_file_mutex.unlock(); + } +} + +void dna_adjust::SolveLMTry(bool COMPUTE_FACTOR, const UINT32& block, bool COMPUTE_INVERSE) +{ + try { + SolveLM(COMPUTE_FACTOR, block, COMPUTE_INVERSE); + } + catch (const std::runtime_error& e) { + debug_SolutionInformation(block); + SignalExceptionAdjustment(e.what(), block); + } +} + +double dna_adjust::ComputePredictedReduction(const UINT32& block) +{ + // Predicted reduction = g^T * delta + lambda * delta^T * delta + // Uses identity damping (D = I), consistent with the augmentation + // in SolveLM which adds lambda*I to the normals. 
+ double gTd = v_gradient_.at(block).dot(v_corrections_.at(block)); + double dTd = 0.0; + UINT32 n = v_corrections_.at(block).rows(); + for (UINT32 i = 0; i < n; ++i) + { + double d_i = v_corrections_.at(block).get(i, 0); + dTd += d_i * d_i; + } + return gTd + lm_lambda_ * dTd; +} + +double dna_adjust::ComputeNetworkPredictedReduction() +{ + double total = 0.0; + for (UINT32 block = 0; block < blockCount_; ++block) + total += ComputePredictedReduction(block); + return total; +} + + +// ----------------------------------------------------------------------- +// Anderson acceleration for phased Gauss-Newton +// ----------------------------------------------------------------------- + +void dna_adjust::InitialiseAndersonAcceleration() +{ + UINT32 m = projectSettings_.a.aa_depth; + UINT32 n = static_cast(bstBinaryRecords_.size()) * 3; + + aa_G_hist_.resize(m); + aa_F_hist_.resize(m); + for (UINT32 i = 0; i < m; ++i) + { + aa_G_hist_[i].resize(n, 0.0); + aa_F_hist_[i].resize(n, 0.0); + } + aa_x_snapshot_.resize(n, 0.0); + aa_hist_count_ = 0; + aa_hist_pos_ = 0; +} + +void dna_adjust::ExtractStationCoordinates(std::vector& coords) +{ + UINT32 totalStations = static_cast(bstBinaryRecords_.size()); + coords.resize(totalStations * 3); + + // Use a visited flag to avoid double-writing junction stations + std::vector visited(totalStations, false); + + for (UINT32 block = 0; block < blockCount_; ++block) + { + const auto& stnList = v_parameterStationList_.at(block); + const auto& estMat = v_estimatedStations_.at(block); + + for (UINT32 s = 0; s < stnList.size(); ++s) + { + UINT32 bstIdx = stnList[s]; + if (visited[bstIdx]) + continue; + visited[bstIdx] = true; + + UINT32 row = s * 3; + UINT32 ci = bstIdx * 3; + coords[ci] = estMat.get(row, 0); + coords[ci + 1] = estMat.get(row + 1, 0); + coords[ci + 2] = estMat.get(row + 2, 0); + } + } +} + +void dna_adjust::ScatterStationCoordinates(const std::vector& coords) +{ + for (UINT32 block = 0; block < blockCount_; ++block) + { + 
const auto& stnList = v_parameterStationList_.at(block); + auto& estMat = v_estimatedStations_.at(block); + + for (UINT32 s = 0; s < stnList.size(); ++s) + { + UINT32 bstIdx = stnList[s]; + UINT32 row = s * 3; + UINT32 ci = bstIdx * 3; + estMat.put(row, 0, coords[ci]); + estMat.put(row + 1, 0, coords[ci + 1]); + estMat.put(row + 2, 0, coords[ci + 2]); + } + } +} + +void dna_adjust::ApplyAndersonAcceleration(UINT32 iteration) +{ + UINT32 m = projectSettings_.a.aa_depth; + UINT32 n = static_cast(aa_x_snapshot_.size()); + + // Extract g_k = G(x_k) — post-iteration coordinates + std::vector g_k(n); + ExtractStationCoordinates(g_k); + + // Compute f_k = g_k - x_k (the fixed-point residual) + std::vector f_k(n); + for (UINT32 i = 0; i < n; ++i) + f_k[i] = g_k[i] - aa_x_snapshot_[i]; + + // Store in ring buffer + aa_G_hist_[aa_hist_pos_] = g_k; + aa_F_hist_[aa_hist_pos_] = f_k; + aa_hist_pos_ = (aa_hist_pos_ + 1) % m; + if (aa_hist_count_ < m) + aa_hist_count_++; + + // Need at least 2 history entries to mix + if (aa_hist_count_ < 2) + return; + + // m_k = number of difference columns + UINT32 m_k = aa_hist_count_ - 1; + + // Build ΔF columns: df[j] = f_k - f_{k-j-1} + // The most recent entry is at (aa_hist_pos_ - 1) % m + // The entry before that is at (aa_hist_pos_ - 2) % m, etc. 
+ UINT32 curr = (aa_hist_pos_ + m - 1) % m; + + // Compute Gram matrix (ΔF^T ΔF) and RHS (ΔF^T f_k) — accumulate over stations + std::vector gram(m_k * m_k, 0.0); + std::vector rhs(m_k, 0.0); + + for (UINT32 j = 0; j < m_k; ++j) + { + UINT32 prev_j = (curr + m - j - 1) % m; + for (UINT32 k = j; k < m_k; ++k) + { + UINT32 prev_k = (curr + m - k - 1) % m; + double dot = 0.0; + for (UINT32 i = 0; i < n; ++i) + { + double df_j = f_k[i] - aa_F_hist_[prev_j][i]; + double df_k = f_k[i] - aa_F_hist_[prev_k][i]; + dot += df_j * df_k; + } + gram[j * m_k + k] = dot; + gram[k * m_k + j] = dot; // symmetric + } + // RHS: ΔF^T f_k + double dot = 0.0; + for (UINT32 i = 0; i < n; ++i) + { + double df_j = f_k[i] - aa_F_hist_[prev_j][i]; + dot += df_j * f_k[i]; + } + rhs[j] = dot; + } + + // Solve Gram * gamma = rhs via Cholesky (in-place) + // If ill-conditioned, skip acceleration + std::vector gamma(rhs); + for (UINT32 j = 0; j < m_k; ++j) + { + double sum = gram[j * m_k + j]; + for (UINT32 k = 0; k < j; ++k) + sum -= gram[j * m_k + k] * gram[j * m_k + k]; + if (sum <= 1e-30) + return; // ill-conditioned, skip acceleration + gram[j * m_k + j] = std::sqrt(sum); + for (UINT32 i = j + 1; i < m_k; ++i) + { + sum = gram[i * m_k + j]; + for (UINT32 k = 0; k < j; ++k) + sum -= gram[i * m_k + k] * gram[j * m_k + k]; + gram[i * m_k + j] = sum / gram[j * m_k + j]; + } + } + // Forward substitution: L y = rhs + for (UINT32 j = 0; j < m_k; ++j) + { + for (UINT32 k = 0; k < j; ++k) + gamma[j] -= gram[j * m_k + k] * gamma[k]; + gamma[j] /= gram[j * m_k + j]; + } + // Back substitution: L^T gamma = y + for (int j = static_cast(m_k) - 1; j >= 0; --j) + { + for (UINT32 k = static_cast(j) + 1; k < m_k; ++k) + gamma[j] -= gram[k * m_k + j] * gamma[k]; + gamma[j] /= gram[j * m_k + j]; + } + + // Compute mixed iterate: x_{k+1} = g_k - ΔG * gamma + // where ΔG[j] = g_k - g_{k-j-1} + std::vector x_mixed(g_k); + for (UINT32 j = 0; j < m_k; ++j) + { + UINT32 prev_j = (curr + m - j - 1) % m; + for (UINT32 
i = 0; i < n; ++i) + x_mixed[i] -= gamma[j] * (g_k[i] - aa_G_hist_[prev_j][i]); + } + + // Safety check: reject if mixed correction is larger than un-mixed + double norm_unmixed = 0.0, norm_mixed = 0.0; + for (UINT32 i = 0; i < n; ++i) + { + double d_unmixed = g_k[i] - aa_x_snapshot_[i]; + double d_mixed = x_mixed[i] - aa_x_snapshot_[i]; + norm_unmixed += d_unmixed * d_unmixed; + norm_mixed += d_mixed * d_mixed; + } + + if (norm_mixed > norm_unmixed) + return; // reject — keep un-mixed iterate + + // Accept: scatter mixed coordinates back + ScatterStationCoordinates(x_mixed); + + // Update maxCorr_ based on mixed correction + double maxCorrMixed = 0.0; + for (UINT32 i = 0; i < n; ++i) + { + double corr = std::fabs(x_mixed[i] - aa_x_snapshot_[i]); + if (corr > maxCorrMixed) + maxCorrMixed = corr; + } + maxCorr_ = (maxCorr_ > 0.0 ? maxCorrMixed : -maxCorrMixed); +} + +void dna_adjust::AdjustSimultaneousLM() +{ + adjustStatus_ = ADJUST_SUCCESS; + initialiseIteration(); + InitialiseLMState(); + + // Reset oscillation diagnostics + corrPrev_.clear(); + stnOscCount_.clear(); + oscHistory_.clear(); + + std::ostringstream ss; + std::string corr_msg; + cpu_timer it_time, tot_time; + + UINT32 iterCount = 0; + + while (iterCount < projectSettings_.a.max_iterations) + { + if (IsCancelled()) + break; + + isIterationComplete_ = false; + + blockLargeCorr_ = 0; + largestCorr_ = 0.0; + + if (projectSettings_.o._cmp_msr_iteration) + printer_->PrintCompMeasurements(0, "a-priori"); + + ss.str(""); + it_time.start(); + + // Solve with LM damping. + // Always recompute the Cholesky factor because the LM solver restores + // the pristine (unfactored) normals from backup after each step, + // and lambda may have changed between iterations. 
+ SolveLMTry(true, 0); + + // Print iteration to adj file + printer_->PrintIteration(incrementIteration()); + PrintAdjustmentTime(it_time, iteration_time); + + maxCorr_ = v_corrections_.at(0).compute_maximum_value(); + OutputLargestCorrection(corr_msg); + UpdateIterationDiagnostics(); + + // Apply corrections + v_estimatedStations_.at(0).add(v_corrections_.at(0)); + + // Update geographic coordinates if needed + if (v_msrTally_.at(0).ContainsNonGPS()) + UpdateGeographicCoords(); + + if (projectSettings_.g.verbose > 0) + { + debug_file << "LM iteration " << CurrentIteration() + << ": lambda=" << lm_lambda_ + << ", maxCorr=" << maxCorr_ + << std::endl; + debug_file.flush(); + } + + // Reduce damping each iteration — the damping serves as regularisation + // for early iterations and decays to zero (pure GN) as convergence nears + lm_lambda_ /= projectSettings_.a.lm_gamma_down; + + iterationCorrections_.add_message(corr_msg); + iterationQueue_.push_and_notify(CurrentIteration()); + isIterationComplete_ = true; + + // Convergence check + bool iterate = !IsCancelled() && fabs(maxCorr_) > projectSettings_.a.iteration_threshold; + if (!iterate) + break; + + // Per-iteration optional output + if (projectSettings_.o._adj_stat_iteration) + { + ComputeStatisticsOnIteration(); + printer_->PrintStatistics(false); + } + if (projectSettings_.o._adj_msr_iteration) + ComputeandPrintAdjMsrOnIteration(); + if (projectSettings_.o._adj_stn_iteration) + printer_->PrintBlockStations(adj_file, 0, &v_estimatedStations_.at(0), &v_normals_.at(0), + false, !v_msrTally_.at(0).ContainsNonGPS(), !v_msrTally_.at(0).ContainsNonGPS(), true, false); + + iterCount++; + + // Rebuild normals for next iteration + bool lastIteration = (iterCount >= projectSettings_.a.max_iterations); + + // Restore normals from backup before UpdateAdjustment rebuilds + v_normals_.at(0) = v_normalsBackup_.at(0); + UpdateAdjustment(!lastIteration); + } + + // Final: restore undamped N and compute full inverse for statistics 
+ v_normals_.at(0) = v_normalsBackup_.at(0); + + // Apply same scaling as original Solve() if needed + UINT32 n = v_normals_.at(0).rows(); + std::vector s_diag; + if (projectSettings_.a.scale_normals_to_unity) + { + s_diag.resize(n); + for (UINT32 i(0); i < n; ++i) + s_diag[i] = 1.0 / sqrt(v_normals_.at(0).get(i, i)); + v_normals_.at(0).scale_symmetric_diagonal(s_diag.data()); + } + + // No clearupper() — FormInverseVarianceMatrix/cholesky_inverse handles + // both packed and full matrices directly (dpotrf(L) reads only the lower triangle). + FormInverseVarianceMatrix(&v_normals_.at(0), false, false); + + if (projectSettings_.a.scale_normals_to_unity) + v_normals_.at(0).scale_symmetric_diagonal(s_diag.data()); + + ValidateandFinaliseAdjustment(tot_time); +} + + +void dna_adjust::AdjustPhasedLM() +{ + initialiseIteration(); + InitialiseLMState(); + + std::string corr_msg; + bool iterate(true); + + cpu_timer it_time, tot_time; + + UINT32 iterCount = 0; + + while (iterCount < projectSettings_.a.max_iterations) + { + if (IsCancelled()) + break; + + isIterationComplete_ = false; + + blockLargeCorr_ = 0; + largestCorr_ = 0.0; + maxCorr_ = 0.0; + + potentialOutlierCount_ = 0; + chiSquaredStage_ = 0.; + measurementParams_ = 0; + + // Print the iteration # to adj file + printer_->PrintIteration(incrementIteration()); + + it_time.start(); + + AdjustPhasedForward(); + if (IsCancelled()) + break; + + AdjustPhasedReverseCombine(); + if (IsCancelled()) + break; + + // Calculate and print total time + PrintAdjustmentTime(it_time, iteration_time); + + // Calculate and print largest adjustment correction and station ID + OutputLargestCorrection(corr_msg); + UpdateIterationDiagnostics(); + + if (projectSettings_.g.verbose > 0) + { + debug_file << "LM phased iteration " << CurrentIteration() + << ": lambda=" << lm_lambda_ + << ", maxCorr=" << maxCorr_ + << std::endl; + debug_file.flush(); + } + + // Reduce damping each iteration — the damping serves as regularisation + // for early 
iterations and decays to zero (pure GN) as convergence nears + lm_lambda_ /= projectSettings_.a.lm_gamma_down; + + iterationCorrections_.add_message(corr_msg); + iterationQueue_.push_and_notify(CurrentIteration()); + isIterationComplete_ = true; + + // Continue iterating? + iterate = !IsCancelled() && fabs(maxCorr_) > projectSettings_.a.iteration_threshold; + if (!iterate) + break; + + // Update normals and measured-computed matrices for the next iteration + UpdateAdjustment(iterate); + if (IsCancelled()) + break; + + if (projectSettings_.o._adj_stat_iteration) + { + ComputeStatisticsOnIteration(); + printer_->PrintStatistics(false); + } + if (projectSettings_.o._adj_msr_iteration) + ComputeandPrintAdjMsrOnIteration(); + + iterCount++; + } + + chiSquared_ = chiSquaredStage_; + + ValidateandFinaliseAdjustment(tot_time); +} + + +void dna_adjust::AdjustPhasedMultiThreadLM() +{ + // The LM phased solver currently runs single-threaded. Temporarily clear + // multi_thread so that UpdateEstimatesFinal, UpdateAdjustment, and other + // helpers use the regular (non-MT) matrices consistently. Without this, + // grow/shrink operations during junction carry-forward target different + // matrix sets, leaving stale pseudo-measurements in the AtVinv and + // measMinusComp matrices after the reverse sweep. 
+ bool savedMultiThread = projectSettings_.a.multi_thread; + projectSettings_.a.multi_thread = false; + AdjustPhasedLM(); + projectSettings_.a.multi_thread = savedMultiThread; +} + + +void dna_adjust::DeSerialiseAdjustedVarianceMatrices() +{ + // No need to facilitate serialising if network adjustment is in stage, + // as this will already be taken care of + if (projectSettings_.a.stage) + return; + + // Create file streams and memory map regions + OpenStageFileStreams(2, sf_rigorous_vars, sf_prec_adj_msrs); + ReserveBlockMapRegions(2, sf_rigorous_vars, sf_prec_adj_msrs); + + for (UINT32 block=0; blocksecond.cx; + double py = prevIt->second.cy; + double pz = prevIt->second.cz; + double magPrev = sqrt(px*px + py*py + pz*pz); + + // Update stored corrections + prevIt->second = {cx, cy, cz}; + + // Skip tiny corrections (sub-millimetre) + if (magCurr < 0.001 && magPrev < 0.001) + { + stnOscCount_[stnIdx] = 0; + continue; + } + + // Dot product and cosine of angle between correction vectors + double dot = cx*px + cy*py + cz*pz; + double denom = magCurr * magPrev; + double cosAngle = (denom > 1e-30) ? dot / denom : 0.0; + + // Magnitude ratio + double ratio = (magPrev > 1e-30) ? 
magCurr / magPrev : 0.0; + + // Oscillation: anti-parallel (cos < -0.5), similar magnitude (ratio 0.3–3.0) + if (cosAngle < -0.5 && ratio > 0.3 && ratio < 3.0) + stnOscCount_[stnIdx]++; + else + stnOscCount_[stnIdx] = 0; + + if (stnOscCount_[stnIdx] >= 2) + { + // Convert to local (e, n, up) + matrix_2d cart(3, 1), local(3, 1); + cart.put(0, 0, cx); + cart.put(1, 0, cy); + cart.put(2, 0, cz); + Rotate_CartLocal(cart, &local, + bstBinaryRecords_.at(stnIdx).currentLatitude, + bstBinaryRecords_.at(stnIdx).currentLongitude); + + double localMag = sqrt(local.get(0,0)*local.get(0,0) + + local.get(1,0)*local.get(1,0) + local.get(2,0)*local.get(2,0)); + + auto hit = oscHistory_.find(stnIdx); + if (hit == oscHistory_.end()) + { + OscillationRecord rec; + rec.stnBstIdx = stnIdx; + rec.firstIteration = CurrentIteration(); + rec.lastIteration = CurrentIteration(); + rec.maxCycles = stnOscCount_[stnIdx]; + rec.firstMag = localMag; + rec.lastMag = localMag; + rec.lastE = local.get(0, 0); + rec.lastN = local.get(1, 0); + rec.lastUp = local.get(2, 0); + oscHistory_[stnIdx] = rec; + } + else + { + hit->second.lastIteration = CurrentIteration(); + hit->second.maxCycles = stnOscCount_[stnIdx]; + hit->second.lastMag = localMag; + hit->second.lastE = local.get(0, 0); + hit->second.lastN = local.get(1, 0); + hit->second.lastUp = local.get(2, 0); + } + } + } + + // For staged adjustments, unload corrections + if (projectSettings_.a.stage) + UnloadBlock(block, 1, sf_corrections); + } +} + +void dna_adjust::PrintOscillationSummary() +{ + if (oscHistory_.empty()) + return; + + // Collect records with meaningful magnitude, sort descending + std::vector sorted; + for (auto& kv : oscHistory_) + { + double peak = std::max(kv.second.firstMag, kv.second.lastMag); + if (peak >= 0.1) + sorted.push_back(&kv.second); + } + + if (sorted.empty()) + return; + + std::sort(sorted.begin(), sorted.end(), + [](const OscillationRecord* a, const OscillationRecord* b) { + return std::max(a->firstMag, 
a->lastMag) > std::max(b->firstMag, b->lastMag); + }); + + // Cap at 20 stations + size_t limit = std::min(sorted.size(), static_cast(20)); + + std::cout << std::endl; + std::cout << "+ Oscillating stations detected (" << sorted.size() << " total, showing top " + << limit << "):" << std::endl; + + for (size_t i = 0; i < limit; ++i) + { + const auto* rec = sorted[i]; + + // Classify direction + double horizMag = sqrt(rec->lastE * rec->lastE + rec->lastN * rec->lastN); + double vertMag = fabs(rec->lastUp); + std::string direction; + if (vertMag < 0.01 * horizMag) + direction = "horizontal"; + else if (horizMag < 0.01 * vertMag) + direction = "vertical"; + else + direction = "3D"; + + std::cout << " - " << bstBinaryRecords_.at(rec->stnBstIdx).stationName + << std::fixed << std::setprecision(1) + << " — " << rec->firstMag << "m to " << rec->lastMag << "m" + << ", " << direction + << ", " << rec->maxCycles << " cycles" + << " (iterations " << rec->firstIteration << "-" << rec->lastIteration << ")" + << std::endl; + } +} + void dna_adjust::ComputePrecisionAdjMsrs(const UINT32& block /*= 0*/) { if (projectSettings_.a.report_mode) @@ -7352,6 +8235,7 @@ void dna_adjust::ComputePrecisionAdjMsrs(const UINT32& block /*= 0*/) // A*V-1*At, where: // - A is design matrix // - V is the inverse of the normals (i.e. precision of estimates) + v_precAdjMsrsFull_.at(block).redim(v_measurementVarianceCount_.at(block), 1); v_precAdjMsrsFull_.at(block).zero(); UINT32 design_row(0); @@ -8031,19 +8915,19 @@ void dna_adjust::ComputeChiSquare_D(it_vmsr_t& _it_msr, UINT32& measurement_inde } -void dna_adjust::FormInverseVarianceMatrix(matrix_2d* vmat, bool LOWER_IS_CLEARED) +void dna_adjust::FormInverseVarianceMatrix(matrix_2d* vmat, bool LOWER_IS_CLEARED, bool mark_symmetric) { if (vmat->rows() == 1) { vmat->put(0, 0, 1./vmat->get(0, 0)); return; } - + // As of version 3.2.0, force all inversions to use MKL. This change // is enforced for two reasons: // 1. 
Sweep, Gaussian inverse and numerical recipes cholesky - // all require matrix data to be stored in row wise fashion, upper - // triangle only, whereas contiguous matrix class stores matrix data + // all require matrix data to be stored in row wise fashion, upper + // triangle only, whereas contiguous matrix class stores matrix data // in column wise fashion, lower triangle. // 2. Sweep, Gaussian never really offered a stable solution. // The following switch is kept in case future development warrants @@ -8053,8 +8937,8 @@ void dna_adjust::FormInverseVarianceMatrix(matrix_2d* vmat, bool LOWER_IS_CLEARE // TODO: All functions which load variance matrix from binary files // store the data in upper triangular form. This could be changed to - // lower triangular form, thus alleviating the need to pass - // LOWER_IS_CLEARED. That is, force all operations to use a lower + // lower triangular form, thus alleviating the need to pass + // LOWER_IS_CLEARED. That is, force all operations to use a lower // triangular matrix. Not sure if this would create an efficiency or not. switch (projectSettings_.a.inverse_method_msr) { @@ -8066,8 +8950,7 @@ void dna_adjust::FormInverseVarianceMatrix(matrix_2d* vmat, bool LOWER_IS_CLEARE // break; case Cholesky_mkl: default: - // Inversion using Intel MKL - vmat->cholesky_inverse(LOWER_IS_CLEARED); + vmat->cholesky_inverse(LOWER_IS_CLEARED, mark_symmetric); break; // choleskyinverse broke once the storage order of the matrix buffer was // changed from row-wise to column wise. 
diff --git a/dynadjust/dynadjust/dnaadjust/dnaadjust.hpp b/dynadjust/dynadjust/dnaadjust/dnaadjust.hpp index c3c098131..f5f9be546 100644 --- a/dynadjust/dynadjust/dnaadjust/dnaadjust.hpp +++ b/dynadjust/dynadjust/dnaadjust/dnaadjust.hpp @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -289,16 +290,24 @@ class dna_adjust { // void PrintEstimatedStationCoordinatestoDNAXML_Y(const std::string& msrFile, // INPUT_FILE_TYPE t); + void PrintOscillationSummary(); void CloseOutputFiles(); void UpdateBinaryFiles(); UINT32 CurrentIteration() const; UINT32& incrementIteration(); + void decrementIteration(); void initialiseIteration(const UINT32& iteration = 0); inline UINT32 CurrentBlock() const { return currentBlock_; }; - inline void SetcurrentBlock(const UINT32 b) { currentBlock_ = b; }; + inline void SetcurrentBlock(const UINT32 b) { + auto now = std::chrono::steady_clock::now(); + if (b != currentBlock_ && blockStartTime_.time_since_epoch().count() > 0) + lastBlockElapsedMs_ = std::chrono::duration_cast(now - blockStartTime_).count(); + currentBlock_ = b; + blockStartTime_ = now; + }; inline void SetmaxCorr(const double c) { @@ -312,6 +321,15 @@ class dna_adjust { } inline bool processingForward() { return forward_; } inline bool processingCombine() { return isCombining_; } + inline int64_t LastBlockElapsedMs() const { return lastBlockElapsedMs_; } + inline UINT32 CurrentBlockStationCount() const { + if (currentBlock_ < v_blockStationsMap_.size()) + return static_cast(v_blockStationsMap_.at(currentBlock_).size()); + return 0; + } + static void SetMaxBlasThreads(int n); + static int GetMaxBlasThreads(); + inline int GetDegreesOfFreedom() const { return degreesofFreedom_; } inline UINT32 GetMeasurementCount() const { return measurementParams_; } inline UINT32 GetUnknownsCount() const { return unknownParams_; } @@ -387,12 +405,15 @@ class dna_adjust { bool isAdjusting_; bool isCombining_; bool forward_; + bool rebuildingDesign_; bool 
isFirstTimeAdjustment_; bool isIterationComplete_; bool isAdjustmentQuestionable_; UINT32 blockCount_; UINT32 currentBlock_; + std::chrono::steady_clock::time_point blockStartTime_; + int64_t lastBlockElapsedMs_; std::chrono::milliseconds total_time_; _ADJUST_STATUS_ adjustStatus_; @@ -506,6 +527,8 @@ class dna_adjust { void DeserialiseBlockFromMappedFile(const UINT32& block, const int count = 0, ...); void UnloadBlock(const UINT32& block, const int file_count = 0, ...); + void AdviseBlockDontNeed(const UINT32& block); + void AdviseBlockWillNeed(const UINT32& block); void PurgeMatricesFromDisk(); // Helpers @@ -541,7 +564,8 @@ class dna_adjust { void UpdateDesignNormalMeasMatrices(pit_vmsr_t _it_msr, UINT32& design_row, bool buildnewMatrices, const UINT32& block, - bool MT_ReverseOrCombine); + bool MT_ReverseOrCombine, + bool skipNormals = false); void UpdateDesignNormalMeasMatrices_A(pit_vmsr_t _it_msr, UINT32& design_row, @@ -549,40 +573,45 @@ class dna_adjust { matrix_2d* measMinusComp, matrix_2d* estimatedStations, matrix_2d* normals, matrix_2d* design, - matrix_2d* AtVinv, bool buildnewMatrices); + matrix_2d* AtVinv, bool buildnewMatrices, + bool skipNormals = false); void UpdateDesignNormalMeasMatrices_BK(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, matrix_2d* measMinusComp, matrix_2d* estimatedStations, matrix_2d* normals, matrix_2d* design, - matrix_2d* AtVinv, bool buildnewMatrices); + matrix_2d* AtVinv, bool buildnewMatrices, + bool skipNormals = false); void UpdateDesignNormalMeasMatrices_C(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, matrix_2d* measMinusComp, matrix_2d* estimatedStations, matrix_2d* normals, matrix_2d* design, - matrix_2d* AtVinv, bool buildnewMatrices); + matrix_2d* AtVinv, bool buildnewMatrices, + bool skipNormals = false); void UpdateDesignNormalMeasMatrices_CEM( pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, matrix_2d* measMinusComp, matrix_2d* estimatedStations, matrix_2d* normals, 
matrix_2d* design, matrix_2d* AtVinv, - bool buildnewMatrices); + bool buildnewMatrices, bool skipNormals = false); void UpdateDesignNormalMeasMatrices_D(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, matrix_2d* measMinusComp, matrix_2d* estimatedStations, matrix_2d* normals, matrix_2d* design, - matrix_2d* AtVinv, bool buildnewMatrices); + matrix_2d* AtVinv, bool buildnewMatrices, + bool skipNormals = false); void UpdateDesignNormalMeasMatrices_E(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, matrix_2d* measMinusComp, matrix_2d* estimatedStations, matrix_2d* normals, matrix_2d* design, - matrix_2d* AtVinv, bool buildnewMatrices); + matrix_2d* AtVinv, bool buildnewMatrices, + bool skipNormals = false); void UpdateDesignMeasMatrices_GX(pit_vmsr_t _it_msr, UINT32& design_row, matrix_2d* measMinusComp, matrix_2d* estimatedStations, @@ -594,113 +623,130 @@ class dna_adjust { matrix_2d* measMinusComp, matrix_2d* estimatedStations, matrix_2d* normals, matrix_2d* design, - matrix_2d* AtVinv, bool buildnewMatrices); + matrix_2d* AtVinv, bool buildnewMatrices, + bool skipNormals = false); void UpdateDesignNormalMeasMatrices_H(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, matrix_2d* measMinusComp, matrix_2d* estimatedStations, matrix_2d* normals, matrix_2d* design, - matrix_2d* AtVinv, bool buildnewMatrices); + matrix_2d* AtVinv, bool buildnewMatrices, + bool skipNormals = false); void UpdateDesignNormalMeasMatrices_HR(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, matrix_2d* measMinusComp, matrix_2d* estimatedStations, matrix_2d* normals, matrix_2d* design, - matrix_2d* AtVinv, bool buildnewMatrices); + matrix_2d* AtVinv, bool buildnewMatrices, + bool skipNormals = false); void UpdateDesignNormalMeasMatrices_I(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, matrix_2d* measMinusComp, matrix_2d* estimatedStations, matrix_2d* normals, matrix_2d* design, - matrix_2d* AtVinv, bool buildnewMatrices); + 
matrix_2d* AtVinv, bool buildnewMatrices, + bool skipNormals = false); void UpdateDesignNormalMeasMatrices_IP(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, matrix_2d* measMinusComp, matrix_2d* estimatedStations, matrix_2d* normals, matrix_2d* design, - matrix_2d* AtVinv, bool buildnewMatrices); + matrix_2d* AtVinv, bool buildnewMatrices, + bool skipNormals = false); void UpdateDesignNormalMeasMatrices_J(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, matrix_2d* measMinusComp, matrix_2d* estimatedStations, matrix_2d* normals, matrix_2d* design, - matrix_2d* AtVinv, bool buildnewMatrices); + matrix_2d* AtVinv, bool buildnewMatrices, + bool skipNormals = false); void UpdateDesignNormalMeasMatrices_JQ(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, matrix_2d* measMinusComp, matrix_2d* estimatedStations, matrix_2d* normals, matrix_2d* design, - matrix_2d* AtVinv, bool buildnewMatrices); + matrix_2d* AtVinv, bool buildnewMatrices, + bool skipNormals = false); void UpdateDesignNormalMeasMatrices_L(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, matrix_2d* measMinusComp, matrix_2d* estimatedStations, matrix_2d* normals, matrix_2d* design, - matrix_2d* AtVinv, bool buildnewMatrices); + matrix_2d* AtVinv, bool buildnewMatrices, + bool skipNormals = false); void UpdateDesignNormalMeasMatrices_M(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, matrix_2d* measMinusComp, matrix_2d* estimatedStations, matrix_2d* normals, matrix_2d* design, - matrix_2d* AtVinv, bool buildnewMatrices); + matrix_2d* AtVinv, bool buildnewMatrices, + bool skipNormals = false); void UpdateDesignNormalMeasMatrices_P(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, matrix_2d* measMinusComp, matrix_2d* estimatedStations, matrix_2d* normals, matrix_2d* design, - matrix_2d* AtVinv, bool buildnewMatrices); + matrix_2d* AtVinv, bool buildnewMatrices, + bool skipNormals = false); void UpdateDesignNormalMeasMatrices_Q(pit_vmsr_t 
_it_msr, UINT32& design_row, const UINT32& block, matrix_2d* measMinusComp, matrix_2d* estimatedStations, matrix_2d* normals, matrix_2d* design, - matrix_2d* AtVinv, bool buildnewMatrices); + matrix_2d* AtVinv, bool buildnewMatrices, + bool skipNormals = false); void UpdateDesignNormalMeasMatrices_R(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, matrix_2d* measMinusComp, matrix_2d* estimatedStations, matrix_2d* normals, matrix_2d* design, - matrix_2d* AtVinv, bool buildnewMatrices); + matrix_2d* AtVinv, bool buildnewMatrices, + bool skipNormals = false); void UpdateDesignNormalMeasMatrices_S(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, matrix_2d* measMinusComp, matrix_2d* estimatedStations, matrix_2d* normals, matrix_2d* design, - matrix_2d* AtVinv, bool buildnewMatrices); + matrix_2d* AtVinv, bool buildnewMatrices, + bool skipNormals = false); void UpdateDesignNormalMeasMatrices_V(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, matrix_2d* measMinusComp, matrix_2d* estimatedStations, matrix_2d* normals, matrix_2d* design, - matrix_2d* AtVinv, bool buildnewMatrices); + matrix_2d* AtVinv, bool buildnewMatrices, + bool skipNormals = false); void UpdateDesignNormalMeasMatrices_X( pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, matrix_2d* measMinusComp, matrix_2d* estimatedStations, - matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices); + matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices, + bool skipNormals = false); void UpdateDesignNormalMeasMatrices_Y( pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, matrix_2d* measMinusComp, matrix_2d* estimatedStations, - matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices); + matrix_2d* design, matrix_2d* AtVinv, bool buildnewMatrices, + bool skipNormals = false); void UpdateDesignNormalMeasMatrices_Z(pit_vmsr_t _it_msr, UINT32& design_row, const UINT32& block, matrix_2d* measMinusComp, matrix_2d* estimatedStations, matrix_2d* normals, 
matrix_2d* design, - matrix_2d* AtVinv, bool buildnewMatrices); + matrix_2d* AtVinv, bool buildnewMatrices, + bool skipNormals = false); void UpdateIgnoredMeasurements(pit_vmsr_t _it_msr, bool storeOriginalMeasurement); @@ -834,7 +880,9 @@ class dna_adjust { void PrepareDesignAndMsrMnsCmpMatricesStage(const UINT32& block); void FillDesignNormalMeasurementsMatrices(bool buildnewMatrices, const UINT32& block, - bool MT_ReverseOrCombine); + bool MT_ReverseOrCombine, + bool skipNormals = false); + void RebuildDesignAndAtVinv(const UINT32& block); // void RecomputeMeasurementsCommonJunctions(const UINT32& nextBlock, const // UINT32& thisBlock, const UINT32& prevBlock); @@ -936,7 +984,7 @@ class dna_adjust { matrix_2d* measMinusComp); void - FormInverseVarianceMatrix(matrix_2d* vmat, bool LOWER_IS_CLEARED = false); + FormInverseVarianceMatrix(matrix_2d* vmat, bool LOWER_IS_CLEARED = false, bool mark_symmetric = false); void FormInverseGPSVarianceMatrix(const it_vmsr_t& _it_msr, matrix_2d* vmat); bool @@ -1213,6 +1261,56 @@ class dna_adjust { v_mat_2d v_corrections_; // vector of residuals matrices v_mat_2d v_correctionsR_; // vector of residuals matrices + // ---------------------------------------------- + // Levenberg-Marquardt / Trust-Region state + double lm_lambda_; // current damping parameter + double chiSquaredPrev_; // chi-squared before step + v_mat_2d v_normalsBackup_; // pristine N per block (before lambda*D augmentation) + v_mat_2d v_gradient_; // g = AtV^-1 r per block + v_mat_2d v_estimatedStationsBackup_; // x_k snapshot for rollback + std::vector> v_scaleDiag_; // saved scaling vectors per block + + void InitialiseLMState(); + void SolveLM(bool COMPUTE_FACTOR, const UINT32& block, bool COMPUTE_INVERSE = false); + void SolveLMTry(bool COMPUTE_FACTOR, const UINT32& block, bool COMPUTE_INVERSE = false); + double ComputePredictedReduction(const UINT32& block); + double ComputeNetworkPredictedReduction(); + void AdjustSimultaneousLM(); + void 
AdjustPhasedLM(); + void AdjustPhasedMultiThreadLM(); + + // ---------------------------------------------- + // Anderson acceleration state + std::vector> aa_G_hist_; // ring buffer of g_k (post-iteration coords) + std::vector> aa_F_hist_; // ring buffer of f_k (residuals) + std::vector aa_x_snapshot_; // x_k saved before iteration + UINT32 aa_hist_count_; // entries stored so far + UINT32 aa_hist_pos_; // current write position in ring buffer + + void InitialiseAndersonAcceleration(); + void ExtractStationCoordinates(std::vector& coords); + void ScatterStationCoordinates(const std::vector& coords); + void ApplyAndersonAcceleration(UINT32 iteration); + + // ---------------------------------------------- + // Iteration diagnostics — oscillation detection + struct StationCorrRecord { + double cx, cy, cz; // Cartesian corrections + }; + struct OscillationRecord { + UINT32 stnBstIdx; // BST record index + UINT32 firstIteration; + UINT32 lastIteration; + UINT32 maxCycles; + double firstMag; + double lastMag; + double lastE, lastN, lastUp; + }; + std::map corrPrev_; // keyed by BST station index + std::map stnOscCount_; // keyed by BST station index + std::map oscHistory_; // keyed by BST station index + void UpdateIterationDiagnostics(); + // ---------------------------------------------- // Adjustment functions and variables for staged adjustment diff --git a/dynadjust/dynadjust/dnaadjustwrapper/dnaadjustprogress.cpp b/dynadjust/dynadjust/dnaadjustwrapper/dnaadjustprogress.cpp index 7365c622d..e97b27f52 100644 --- a/dynadjust/dynadjust/dnaadjustwrapper/dnaadjustprogress.cpp +++ b/dynadjust/dynadjust/dnaadjustwrapper/dnaadjustprogress.cpp @@ -21,6 +21,7 @@ #include #include +#include /// \cond #include @@ -366,24 +367,28 @@ void dna_adjust_progress_thread::processAdjustment() // print new block to screen when adjusting only if (block != currentBlock && _dnaAdj->IsAdjusting()) - { + { ss.str(""); ss << " Iteration " << std::right << std::setw(2) << std::fixed << 
std::setprecision(0) << _dnaAdj->CurrentIteration(); if (_p->a.multi_thread && !_dnaAdj->processingCombine()) ss << std::left << std::setw(13) << ", adjusting..."; else - ss << ", block " << std::left << std::setw(6) << std::fixed << std::setprecision(0) << _dnaAdj->CurrentBlock() + 1; - - sst.str(""); - if (first_time) { - sst << std::setw(PROGRESS_ADJ_BLOCK_28) << std::left << " "; - first_time = false; + ss << ", block " << std::left << std::setw(4) << std::fixed << std::setprecision(0) << _dnaAdj->CurrentBlock() + 1; + UINT32 stnCount = _dnaAdj->CurrentBlockStationCount(); + if (stnCount > 0) + { + int max_t = dna_adjust::GetMaxBlasThreads(); + ss << " (" << std::right << std::setw(5) << stnCount << " stns, " << std::setw(2) << max_t << "T)"; + } + int64_t elapsedMs = _dnaAdj->LastBlockElapsedMs(); + if (elapsedMs > 0) + ss << " " << std::fixed << std::setprecision(1) << (elapsedMs / 1000.0) << "s"; } - - sst << PROGRESS_BACKSPACE_28 << std::setw(PROGRESS_ADJ_BLOCK_28) << std::left << ss.str(); - coutMessage(sst.str()); + ss << std::endl; + coutMessage(ss.str()); + first_time = true; currentBlock = block; } diff --git a/dynadjust/dynadjust/dnaadjustwrapper/dnaadjustwrapper.cpp b/dynadjust/dynadjust/dnaadjustwrapper/dnaadjustwrapper.cpp index 48920196d..40b47f12a 100644 --- a/dynadjust/dynadjust/dnaadjustwrapper/dnaadjustwrapper.cpp +++ b/dynadjust/dynadjust/dnaadjustwrapper/dnaadjustwrapper.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -38,6 +39,7 @@ /// \endcond #include +#include #include #include #include @@ -60,6 +62,14 @@ using namespace dynadjust::iostreams; extern bool running; extern std::mutex cout_mutex; +static dna_adjust* g_netAdjust = nullptr; + +void sigint_handler(int) +{ + if (g_netAdjust) + g_netAdjust->CancelAdjustment(); +} + using namespace dynadjust; using namespace dynadjust::epsg; @@ -199,75 +209,75 @@ void DeserialiseVarianceMatrices(dna_adjust* netAdjust, const project_settings* void 
GenerateStatistics(dna_adjust* netAdjust, const project_settings* p) { // Generate statistics - // Don't produce statistics only for block 1 only adjustments - if (p->a.adjust_mode != Phased_Block_1Mode) + if (!p->g.quiet) { - if (!p->g.quiet) + std::cout << "+ Generating statistics..."; + std::cout.flush(); + } + netAdjust->GenerateStatistics(); + if (!p->g.quiet) + { + std::cout << " done." << std::endl; + + // Don't print detailed statistics for block 1 only adjustments + if (p->a.adjust_mode == Phased_Block_1Mode) + return; + + std::cout << "+ Adjustment results:" << std::endl << std::endl; + std::cout << "+" << OUTPUTLINE << std::endl; + std::cout << std::setw(PRINT_VAR_PAD) << std::left << " Number of unknown parameters" << std::fixed << std::setprecision(0) << netAdjust->GetUnknownsCount(); + if (netAdjust->GetAllFixed()) + std::cout << " (All stations held constrained)"; + std::cout << std::endl; + + std::cout << std::setw(PRINT_VAR_PAD) << std::left << " Number of measurements" << std::fixed << std::setprecision(0) << netAdjust->GetMeasurementCount(); + + if (netAdjust->GetPotentialOutlierCount() > 0) { - std::cout << "+ Generating statistics..."; - std::cout.flush(); + std::cout << " (" << netAdjust->GetPotentialOutlierCount() << " potential outlier"; + if (netAdjust->GetPotentialOutlierCount() > 1) + std::cout << "s"; + std::cout << ")"; } - netAdjust->GenerateStatistics(); - if (!p->g.quiet) - { - std::cout << " done." 
<< std::endl; - - std::cout << "+ Adjustment results:" << std::endl << std::endl; - std::cout << "+" << OUTPUTLINE << std::endl; - std::cout << std::setw(PRINT_VAR_PAD) << std::left << " Number of unknown parameters" << std::fixed << std::setprecision(0) << netAdjust->GetUnknownsCount(); - if (netAdjust->GetAllFixed()) - std::cout << " (All stations held constrained)"; - std::cout << std::endl; + std::cout << std::endl; + std::cout << std::setw(PRINT_VAR_PAD) << std::left << " Degrees of freedom" << std::fixed << std::setprecision(0) << netAdjust->GetDegreesOfFreedom() << std::endl; + std::cout << std::setw(PRINT_VAR_PAD) << std::left << " Chi squared" << std::fixed << std::setprecision(2) << netAdjust->GetChiSquared() << std::endl; + std::cout << std::setw(PRINT_VAR_PAD) << std::left << " Rigorous sigma zero" << std::fixed << std::setprecision(3) << netAdjust->GetSigmaZero() << std::endl; + std::cout << std::setw(PRINT_VAR_PAD) << std::left << " Global (Pelzer) Reliability" << std::fixed << std::setw(8) << std::setprecision(3) << netAdjust->GetGlobalPelzerRel() << "(excludes non redundant measurements)" << std::endl << std::endl; - std::cout << std::setw(PRINT_VAR_PAD) << std::left << " Number of measurements" << std::fixed << std::setprecision(0) << netAdjust->GetMeasurementCount(); - - if (netAdjust->GetPotentialOutlierCount() > 0) - { - std::cout << " (" << netAdjust->GetPotentialOutlierCount() << " potential outlier"; - if (netAdjust->GetPotentialOutlierCount() > 1) - std::cout << "s"; - std::cout << ")"; - } - std::cout << std::endl; - std::cout << std::setw(PRINT_VAR_PAD) << std::left << " Degrees of freedom" << std::fixed << std::setprecision(0) << netAdjust->GetDegreesOfFreedom() << std::endl; - std::cout << std::setw(PRINT_VAR_PAD) << std::left << " Chi squared" << std::fixed << std::setprecision(2) << netAdjust->GetChiSquared() << std::endl; - std::cout << std::setw(PRINT_VAR_PAD) << std::left << " Rigorous sigma zero" << std::fixed << 
std::setprecision(3) << netAdjust->GetSigmaZero() << std::endl; - std::cout << std::setw(PRINT_VAR_PAD) << std::left << " Global (Pelzer) Reliability" << std::fixed << std::setw(8) << std::setprecision(3) << netAdjust->GetGlobalPelzerRel() << "(excludes non redundant measurements)" << std::endl << std::endl; - - std::stringstream ss(""); - ss << std::left << " Chi-Square test (" << std::setprecision(1) << std::fixed << p->a.confidence_interval << "%)"; - std::cout << std::setw(PRINT_VAR_PAD) << std::left << ss.str(); - ss.str(""); - ss << std::fixed << std::setprecision(3) << - netAdjust->GetChiSquaredLowerLimit() << " < " << - netAdjust->GetSigmaZero() << " < " << - netAdjust->GetChiSquaredUpperLimit(); - std::cout << std::setw(CHISQRLIMITS) << std::left << ss.str(); - ss.str(""); - - if (netAdjust->GetDegreesOfFreedom() < 1) - ss << "NO REDUNDANCY"; - else + std::stringstream ss(""); + ss << std::left << " Chi-Square test (" << std::setprecision(1) << std::fixed << p->a.confidence_interval << "%)"; + std::cout << std::setw(PRINT_VAR_PAD) << std::left << ss.str(); + ss.str(""); + ss << std::fixed << std::setprecision(3) << + netAdjust->GetChiSquaredLowerLimit() << " < " << + netAdjust->GetSigmaZero() << " < " << + netAdjust->GetChiSquaredUpperLimit(); + std::cout << std::setw(CHISQRLIMITS) << std::left << ss.str(); + ss.str(""); + + if (netAdjust->GetDegreesOfFreedom() < 1) + ss << "NO REDUNDANCY"; + else + { + ss << "*** "; + switch (netAdjust->GetTestResult()) { - ss << "*** "; - switch (netAdjust->GetTestResult()) - { - case test_stat_pass: - ss << "PASSED"; // within upper and lower - break; - case test_stat_warning: - ss << "WARNING"; // less than lower limit - break; - case test_stat_fail: - ss << "FAILED"; // greater than upper limit - break; - } - ss << " ***"; + case test_stat_pass: + ss << "PASSED"; // within upper and lower + break; + case test_stat_warning: + ss << "WARNING"; // less than lower limit + break; + case test_stat_fail: + ss << "FAILED"; // 
greater than upper limit + break; } - - std::cout << std::setw(PASS_FAIL) << std::right << ss.str() << std::endl; - std::cout << "+" << OUTPUTLINE << std::endl << std::endl; + ss << " ***"; } + + std::cout << std::setw(PASS_FAIL) << std::right << ss.str() << std::endl; + std::cout << "+" << OUTPUTLINE << std::endl << std::endl; } else std::cout << std::endl; @@ -617,6 +627,15 @@ int ParseCommandLineOptions(const int& argc, char* argv[], const boost::program_ // p.a.inverse_method_msr = p.a.inverse_method_lsq; if (vm.count(SCALE_NORMAL_UNITY)) p.a.scale_normals_to_unity = 1; + if (vm.count(LM_ENABLED)) + p.a.lm_enabled = true; + if (vm.count(AA_ENABLED)) + p.a.aa_enabled = true; + if (p.a.aa_enabled && p.a.lm_enabled) + { + std::cout << "Warning: --anderson-acceleration is for Gauss-Newton only; disabled because --lm-enabled is set." << std::endl; + p.a.aa_enabled = false; + } if (vm.count(OUTPUT_ADJ_MSR_TSTAT)) p.o._adj_msr_tstat = 1; if (vm.count(OUTPUT_ADJ_MSR_DBID)) @@ -861,6 +880,31 @@ int main(int argc, char* argv[]) "Type b uncertainties to be added to each computed uncertainty. arg is a comma delimited string that provides 1D, 2D or 3D uncertainties in the local reference frame (e.g. \"up\" or \"e,n\" or \"e,n,up\").") (TYPE_B_FILE, boost::program_options::value(&p.a.type_b_file), "Type b uncertainties file name. Full path to a file containing Type b uncertainties to be added to the computed uncertainty for specific sites.") + (LM_ENABLED, + "Enable Levenberg-Marquardt / trust-region solver.") + (LM_LAMBDA_INIT, boost::program_options::value(&p.a.lm_lambda_init), + (std::string("Initial LM damping parameter. Default is ")+ + StringFromT(p.a.lm_lambda_init)+std::string(".")).c_str()) + (LM_ETA_GOOD, boost::program_options::value(&p.a.lm_eta_good), + (std::string("Good-step threshold for lambda reduction. 
Default is ")+ + StringFromT(p.a.lm_eta_good)+std::string(".")).c_str()) + (LM_ETA_ACCEPT, boost::program_options::value(&p.a.lm_eta_accept), + (std::string("Minimum acceptable gain ratio. Default is ")+ + StringFromT(p.a.lm_eta_accept)+std::string(".")).c_str()) + (LM_GAMMA_UP, boost::program_options::value(&p.a.lm_gamma_up), + (std::string("Lambda increase factor on step rejection. Default is ")+ + StringFromT(p.a.lm_gamma_up)+std::string(".")).c_str()) + (LM_GAMMA_DOWN, boost::program_options::value(&p.a.lm_gamma_down), + (std::string("Lambda decrease factor on good step. Default is ")+ + StringFromT(p.a.lm_gamma_down)+std::string(".")).c_str()) + (LM_MAX_REJECTS, boost::program_options::value(&p.a.lm_max_rejects), + (std::string("Maximum consecutive LM rejections per iteration. Default is ")+ + StringFromT(p.a.lm_max_rejects)+std::string(".")).c_str()) + (AA_ENABLED, + "Enable Anderson acceleration for phased Gauss-Newton adjustment.") + (AA_DEPTH, boost::program_options::value(&p.a.aa_depth), + (std::string("Anderson acceleration history depth. Default is ")+ + StringFromT(p.a.aa_depth)+std::string(".")).c_str()) ; staged_adj_options.add_options() @@ -868,6 +912,10 @@ int main(int argc, char* argv[]) "Recreate memory mapped files.") (PURGE_STAGE_FILES, "Purge memory mapped files from disk upon adjustment completion.") + (STAGE_PATH, boost::program_options::value(&p.a.stage_path), + "Directory for memory mapped stage files. Default is the output folder.") + (MAX_THREADS, boost::program_options::value(&p.a.max_threads), + "Maximum number of BLAS threads for linear algebra operations. 
Default is 0 (auto).") ; output_options.add_options() @@ -1166,6 +1214,12 @@ int main(int argc, char* argv[]) if (p.a.scale_normals_to_unity) std::cout << std::setw(PRINT_VAR_PAD) << std::left << " Scale normals to unity: " << "yes" << std::endl; + if (p.a.lm_enabled) + { + std::cout << std::setw(PRINT_VAR_PAD) << std::left << " LM solver: " << "enabled" << std::endl; + std::cout << std::setw(PRINT_VAR_PAD) << std::left << " LM lambda init: " << p.a.lm_lambda_init << std::endl; + std::cout << std::setw(PRINT_VAR_PAD) << std::left << " LM eta good/accept: " << p.a.lm_eta_good << " / " << p.a.lm_eta_accept << std::endl; + } if (!p.a.station_constraints.empty()) { std::cout << std::setw(PRINT_VAR_PAD) << std::left << " Station constraints: " << p.a.station_constraints << std::endl; @@ -1328,7 +1382,12 @@ int main(int argc, char* argv[]) try { running = true; - int nthreads_la = init_linear_algebra_threads(); + // Install SIGINT handler for graceful cancellation + g_netAdjust = &netAdjust; + std::signal(SIGINT, sigint_handler); + + int nthreads_la = init_linear_algebra_threads(p.a.max_threads); + dna_adjust::SetMaxBlasThreads(nthreads_la); std::thread progress(dna_adjust_progress_thread(&netAdjust, &p)); // Do adjustment using linear algebra threads @@ -1355,7 +1414,10 @@ int main(int argc, char* argv[]) PrintSummaryMessage(&netAdjust, &p, &elapsed_time); if (netAdjust.GetStatus() > ADJUST_THRESHOLD_EXCEEDED) + { + netAdjust.PrintOscillationSummary(); return ADJUST_SUCCESS; + } // Generate statistics GenerateStatistics(&netAdjust, &p); @@ -1402,6 +1464,8 @@ int main(int argc, char* argv[]) return EXIT_FAILURE; } + netAdjust.PrintOscillationSummary(); + if (!p.g.quiet) std::cout << std::endl << "+ Open " << leafStr(p.o._adj_file) << " to view the adjustment details." 
<< std::endl << std::endl; diff --git a/dynadjust/include/config/dnaoptions-interface.hpp b/dynadjust/include/config/dnaoptions-interface.hpp index d80b81615..4d3826a5e 100644 --- a/dynadjust/include/config/dnaoptions-interface.hpp +++ b/dynadjust/include/config/dnaoptions-interface.hpp @@ -151,8 +151,25 @@ const char* const LSQ_INVERSE_METHOD = "inversion-method"; const char* const SCALE_NORMAL_UNITY = "scale-normals-to-unity"; const char* const PURGE_STAGE_FILES = "purge-stage-files"; const char* const RECREATE_STAGE_FILES = "create-stage-files"; +const char* const STAGE_PATH = "stage-path"; const char* const UPDATE_ORIGINAL_STN_FILE = "update-orig-stn-file"; +// Levenberg-Marquardt / Trust-Region options +const char* const LM_ENABLED = "lm-enabled"; +const char* const LM_LAMBDA_INIT = "lm-lambda-init"; +const char* const LM_ETA_GOOD = "lm-eta-good"; +const char* const LM_ETA_ACCEPT = "lm-eta-accept"; +const char* const LM_GAMMA_UP = "lm-gamma-up"; +const char* const LM_GAMMA_DOWN = "lm-gamma-down"; +const char* const LM_MAX_REJECTS = "lm-max-rejects"; + +// Anderson acceleration options +const char* const AA_ENABLED = "anderson-acceleration"; +const char* const AA_DEPTH = "aa-depth"; + +// Threading options +const char* const MAX_THREADS = "max-threads"; + const char* const SEG_MIN_INNER_STNS = "min-inner-stns"; const char* const SEG_THRESHOLD_STNS = "max-block-stns"; const char* const SEG_STARTING_STN = "starting-stns"; diff --git a/dynadjust/include/config/dnaoptions.hpp b/dynadjust/include/config/dnaoptions.hpp index 77e21eeae..46ce79572 100644 --- a/dynadjust/include/config/dnaoptions.hpp +++ b/dynadjust/include/config/dnaoptions.hpp @@ -430,7 +430,13 @@ struct adjust_settings : private boost::equality_comparable { , iteration_threshold((float)0.0005), free_std_dev(10.0), fixed_std_dev(PRECISION_1E6), station_constraints("") , map_file(""), bst_file(""), bms_file(""), seg_file(""), comments("") , command_line_arguments("") - , type_b_global (""), type_b_file 
("") {} + , type_b_global (""), type_b_file ("") + , lm_enabled(false), lm_lambda_init(1e-3) + , lm_eta_good(0.75), lm_eta_accept(0.25) + , lm_gamma_up(2.0), lm_gamma_down(3.0) + , lm_max_rejects(10) + , aa_enabled(false), aa_depth(5) + , max_threads(0) {} private: // Disallow use of compiler generated equality operator. @@ -471,6 +477,7 @@ struct adjust_settings : private boost::equality_comparable { UINT16 scale_normals_to_unity; // Scale normals to unity prior to inversion bool purge_stage_files; // Purge memory mapped files from disk upon adjustment completion. UINT16 recreate_stage_files; // Recreate memory mapped files. + std::string stage_path; // Directory for memory mapped stage files (default: output_folder). float iteration_threshold; // Convergence limit double free_std_dev; // SD for free stations double fixed_std_dev; // SD for fixed stations @@ -483,6 +490,22 @@ struct adjust_settings : private boost::equality_comparable { std::string command_line_arguments; std::string type_b_global; // Comma delimited string containing Type b uncertainties to be applied to all uncertainties computed from an adjustment std::string type_b_file; // File path to Type b uncertainties to be applied to specific site uncertainties computed from an adjustment + + // Levenberg-Marquardt / Trust-Region settings + bool lm_enabled; // Enable LM/trust-region solver + double lm_lambda_init; // Initial LM damping parameter + double lm_eta_good; // Good-step threshold for lambda reduction + double lm_eta_accept; // Minimum acceptable gain ratio + double lm_gamma_up; // Lambda increase factor on reject/mediocre + double lm_gamma_down; // Lambda decrease factor on good step + UINT32 lm_max_rejects; // Max consecutive rejections per iteration + + // Anderson acceleration settings + bool aa_enabled; // Enable Anderson acceleration for phased GN + UINT32 aa_depth; // Anderson acceleration history depth (default 5) + + // Threading + int max_threads; // Maximum BLAS threads (0 = auto) }; 
// datum and geoid settings diff --git a/dynadjust/include/functions/dnatemplatematrixfuncs.hpp b/dynadjust/include/functions/dnatemplatematrixfuncs.hpp index 05e0df853..092c7d896 100644 --- a/dynadjust/include/functions/dnatemplatematrixfuncs.hpp +++ b/dynadjust/include/functions/dnatemplatematrixfuncs.hpp @@ -56,7 +56,6 @@ void GetDirectionsVarianceMatrix(msr_t_Iterator begin, matrix_2d* vmat) UINT32 a, angle_count(bmsRecord->vectorCount2 - 1); // number of directions excluding the RO UINT32 skip(0), ignored(bmsRecord->vectorCount1 - bmsRecord->vectorCount2); - vmat->zero(); vmat->redim(angle_count, angle_count); bmsRecord++; @@ -90,8 +89,7 @@ void GetGPSVarianceMatrix(const msr_t_Iterator begin, matrix_2d* vmat) { msr_t_Iterator bmsRecord(begin); UINT32 variance_dim(bmsRecord->vectorCount1 * 3), covariance_dim, cov; - vmat->zero(); - vmat->redim(variance_dim, variance_dim); + vmat->redim(variance_dim, variance_dim); for (UINT32 var(0), cov_elem; var #include +#include #include #include #include @@ -29,6 +31,22 @@ namespace dynadjust { namespace math { +// Maximum BLAS thread count, set via set_max_blas_threads(). +static int g_max_blas_threads = 0; + +void set_max_blas_threads(int n) { g_max_blas_threads = n; } +int get_max_blas_threads() { return g_max_blas_threads; } + +// Compute BLAS thread count scaled proportionally to g_max_blas_threads. +// Tiers: max, max/2, max/4, 1 — selected by matrix dimension n. 
+static inline int blas_threads_for(lapack_int n) { + int max_t = g_max_blas_threads; + if (max_t <= 1) return 1; + if (n > 8000) return max_t; + if (n > 4000) return std::max(1, max_t / 2); + if (n > 2000) return std::max(1, max_t / 4); + return 1; +} std::ostream& operator<<(std::ostream& os, const matrix_2d& rhs) { if (os.iword(0) == binary) { @@ -44,18 +62,25 @@ std::ostream& operator<<(std::ostream& os, const matrix_2d& rhs) { os.write(reinterpret_cast(&rhs._mem_rows), sizeof(UINT32)); os.write(reinterpret_cast(&rhs._mem_cols), sizeof(UINT32)); + // Alignment padding — keeps data region at 8-byte aligned offset (24 bytes) + // Must match WriteMappedFileRegion / ReadMappedFileRegion / AttachMappedFileRegion layout + const UINT32 pad = 0; + os.write(reinterpret_cast(&pad), sizeof(UINT32)); + UINT32 c, r; switch (rhs._matrixType) { case mtx_lower: - // output lower triangular part of a square matrix if (rhs._mem_rows != rhs._mem_cols) throw std::runtime_error("matrix_2d operator<< (): Matrix is not square."); - // print each column - for (c = 0; c < rhs._mem_cols; ++c) - os.write(reinterpret_cast(rhs.getelementref(c, c)), (rhs._mem_rows - c) * sizeof(double)); - + if (rhs._packed) { + os.write(reinterpret_cast(rhs._buffer), + matrix_2d::packed_size(rhs._mem_rows) * sizeof(double)); + } else { + for (c = 0; c < rhs._mem_cols; ++c) + os.write(reinterpret_cast(rhs.getelementref(c, c)), (rhs._mem_rows - c) * sizeof(double)); + } break; case mtx_sparse: break; case mtx_full: @@ -108,7 +133,7 @@ void out_of_memory_handler() { } matrix_2d::matrix_2d() - : _mem_cols(0), _mem_rows(0), _cols(0), _rows(0), _buffer(0), _maxvalCol(0), _maxvalRow(0), _matrixType(mtx_full) { + : _mem_cols(0), _mem_rows(0), _cols(0), _rows(0), _buffer(0), _owns_buffer(true), _maxvalCol(0), _maxvalRow(0), _matrixType(mtx_full), _symmetric(false), _packed(false) { std::set_new_handler(out_of_memory_handler); // if this class were to be modified to use templates, each @@ -125,9 +150,12 @@ 
matrix_2d::matrix_2d(const UINT32& rows, const UINT32& columns) _cols(columns), _rows(rows), _buffer(0), + _owns_buffer(true), _maxvalCol(0), _maxvalRow(0), - _matrixType(mtx_full) { + _matrixType(mtx_full), + _symmetric(false), + _packed(false) { std::set_new_handler(out_of_memory_handler); allocate(_rows, _cols); @@ -140,9 +168,12 @@ matrix_2d::matrix_2d(const UINT32& rows, const UINT32& columns, const double dat _cols(columns), _rows(rows), _buffer(0), + _owns_buffer(true), _maxvalCol(0), _maxvalRow(0), - _matrixType(matrix_type) { + _matrixType(matrix_type), + _symmetric(false), + _packed(false) { std::set_new_handler(out_of_memory_handler); std::stringstream ss; @@ -197,17 +228,26 @@ matrix_2d::matrix_2d(const matrix_2d& newmat) _cols(newmat.columns()), _rows(newmat.rows()), _buffer(0), + _owns_buffer(true), _maxvalCol(newmat.maxvalueCol()), _maxvalRow(newmat.maxvalueRow()), - _matrixType(newmat.matrixType()) { + _matrixType(newmat.matrixType()), + _symmetric(newmat._symmetric), + _packed(newmat._packed) { std::set_new_handler(out_of_memory_handler); - allocate(_mem_rows, _mem_cols); - - const double* ptr = newmat.getbuffer(); - - // copy buffer - memcpy(_buffer, ptr, newmat.buffersize()); + if (_packed) { + std::size_t ps = packed_size(_mem_rows); + _buffer = static_cast(std::malloc(ps * sizeof(double))); + if (!_buffer) throw NetMemoryException("Insufficient memory for packed matrix copy."); + memcpy(_buffer, newmat.getbuffer(), ps * sizeof(double)); + } else { + // Allocate without zeroing — memcpy immediately overwrites entire buffer + std::size_t total_size = static_cast(_mem_rows) * _mem_cols; + _buffer = static_cast(std::malloc(total_size * sizeof(double))); + if (!_buffer) throw NetMemoryException("Insufficient memory for matrix copy."); + memcpy(_buffer, newmat.getbuffer(), total_size * sizeof(double)); + } } matrix_2d::~matrix_2d() { @@ -216,14 +256,21 @@ matrix_2d::~matrix_2d() { } std::size_t matrix_2d::get_size() { + // 8 UINT32s: matrixType, 
rows, cols, mem_rows, mem_cols, _pad, maxvalRow, maxvalCol + // The padding UINT32 ensures the data region starts at an 8-byte aligned offset (24 bytes) + // from the region base, enabling in-place mmap buffer attachment. size_t size = - (7 * sizeof(UINT32)); // UINT32 _matrixType, _mem_cols, _mem_rows, _cols, _rows, _maxvalRow, _maxvalCol + (8 * sizeof(UINT32)); switch (_matrixType) { case mtx_lower: size += sumOfConsecutiveIntegers(_mem_rows) * sizeof(double); break; case mtx_sparse: break; case mtx_full: - default: size += buffersize(); + default: + if (_packed) + size += packed_size(_mem_rows) * sizeof(double); + else + size += buffersize(); } return size; } @@ -241,7 +288,6 @@ void matrix_2d::ReadMappedFileRegion(void* addr) { switch (_matrixType) { case mtx_sparse: - // _mem_cols and _mem_rows equal _cols and _rows _mem_rows = _rows; _mem_cols = _cols; break; @@ -250,10 +296,21 @@ void matrix_2d::ReadMappedFileRegion(void* addr) { default: _mem_rows = *data_U++; _mem_cols = *data_U++; + ++data_U; // skip alignment padding UINT32 break; } - allocate(_mem_rows, _mem_cols); + if (_matrixType == mtx_lower && _mem_rows == _mem_cols) { + deallocate(); + _packed = true; + _symmetric = true; + std::size_t ps = packed_size(_mem_rows); + _buffer = static_cast(std::calloc(ps, sizeof(double))); + if (!_buffer) throw NetMemoryException("Insufficient memory for packed matrix read."); + } else { + _packed = false; + allocate(_mem_rows, _mem_cols); + } double* data_d; int* data_i; @@ -292,15 +349,21 @@ void matrix_2d::ReadMappedFileRegion(void* addr) { return; break; case mtx_lower: + assert(_mem_rows == _mem_cols && "ReadMappedFileRegion(mtx_lower): matrix must be square"); data_d = reinterpret_cast(data_U); - // read each column - for (c = 0; c < _mem_cols; ++c) { - memcpy(getelementref(c, c), data_d, (_mem_rows - c) * sizeof(double)); - data_d += (_mem_rows - c); + if (_packed) { + std::size_t ps = packed_size(_mem_rows); + memcpy(_buffer, data_d, ps * sizeof(double)); + 
data_d += ps; + } else { + for (c = 0; c < _mem_cols; ++c) { + memcpy(getbuffer(c, c), data_d, (_mem_rows - c) * sizeof(double)); + data_d += (_mem_rows - c); + } } - fillupper(); + _symmetric = true; break; case mtx_full: default: @@ -342,9 +405,32 @@ void matrix_2d::WriteMappedFileRegion(void* addr) { default: *data_U++ = _mem_rows; *data_U++ = _mem_cols; + *data_U++ = 0; // alignment padding break; } + // In-place mmap: data is already in the region, only update footer + if (!_owns_buffer) { + // Calculate footer position by skipping past data + double* data_d; + switch (_matrixType) { + case mtx_lower: + if (_packed) + data_d = reinterpret_cast(data_U) + packed_size(_mem_rows); + else + data_d = reinterpret_cast(data_U) + sumOfConsecutiveIntegers(_mem_rows); + break; + case mtx_full: + default: + data_d = reinterpret_cast(data_U) + static_cast(_mem_rows) * _mem_cols; + break; + } + PUINT32 footer = reinterpret_cast(data_d); + *footer++ = _maxvalRow; + *footer = _maxvalCol; + return; + } + double* data_d; int* data_i; @@ -384,10 +470,15 @@ void matrix_2d::WriteMappedFileRegion(void* addr) { case mtx_lower: data_d = reinterpret_cast(data_U); - // print each column - for (c = 0; c < _mem_cols; ++c) { - memcpy(data_d, getbuffer(c, c), (_mem_rows - c) * sizeof(double)); - data_d += (_mem_rows - c); + if (_packed) { + std::size_t ps = packed_size(_mem_rows); + memcpy(data_d, _buffer, ps * sizeof(double)); + data_d += ps; + } else { + for (c = 0; c < _mem_cols; ++c) { + memcpy(data_d, getbuffer(c, c), (_mem_rows - c) * sizeof(double)); + data_d += (_mem_rows - c); + } } break; case mtx_full: @@ -408,16 +499,111 @@ void matrix_2d::WriteMappedFileRegion(void* addr) { *data_U = _maxvalCol; } -void matrix_2d::allocate() { allocate(_mem_rows, _mem_cols); } -// creates memory for desired "memory size", not matrix dimensions -void matrix_2d::allocate(const UINT32& rows, const UINT32& columns) { - //_method_ = "allocate"; +void matrix_2d::AttachMappedFileRegion(void* addr) { 
+ // Read header metadata — same layout as ReadMappedFileRegion + PUINT32 data_U = reinterpret_cast(addr); + _matrixType = *data_U++; + _rows = *data_U++; + _cols = *data_U++; + + switch (_matrixType) { + case mtx_sparse: + _mem_rows = _rows; + _mem_cols = _cols; + // Sparse cannot use in-place — fall back to copy path + ReadMappedFileRegion(addr); + return; + case mtx_lower: + case mtx_full: + default: + _mem_rows = *data_U++; + _mem_cols = *data_U++; + ++data_U; // skip alignment padding + break; + } + // Release any existing buffer deallocate(); - // an exception will be thrown by out_of_memory_handler - // if memory cannot be allocated + // Point _buffer directly at the data region in the mmap + _buffer = reinterpret_cast(data_U); + _owns_buffer = false; + + // Set packed/symmetric flags for lower-triangular matrices + if (_matrixType == mtx_lower && _mem_rows == _mem_cols) { + _packed = true; + _symmetric = true; + } else { + _packed = false; + } + + // Read footer (maxvalRow, maxvalCol) from after the data + double* data_d; + switch (_matrixType) { + case mtx_lower: + data_d = _buffer + packed_size(_mem_rows); + break; + case mtx_full: + default: + data_d = _buffer + static_cast(_mem_rows) * _mem_cols; + break; + } + + PUINT32 footer = reinterpret_cast(data_d); + _maxvalRow = *footer++; + _maxvalCol = *footer; +} + + +void matrix_2d::DetachMappedFileRegion(void* addr) { + if (_owns_buffer || _buffer == nullptr) + return; + + // Write updated footer (maxvalRow, maxvalCol) to the mmap region. + // Header: 6 UINT32s (matrixType, rows, cols, mem_rows, mem_cols, pad) = 24 bytes. + // Skip past data to find the footer position. 
+ double* data_d; + switch (_matrixType) { + case mtx_lower: + data_d = _buffer + packed_size(_mem_rows); + break; + case mtx_full: + default: + data_d = _buffer + static_cast(_mem_rows) * _mem_cols; + break; + } + + PUINT32 footer = reinterpret_cast(data_d); + *footer++ = _maxvalRow; + *footer = _maxvalCol; + + // Detach without freeing the mmap memory + _buffer = nullptr; + _owns_buffer = true; +} + + +void matrix_2d::allocate() { + if (_matrixType == mtx_lower && _mem_rows == _mem_cols && _mem_rows > 0) { + deallocate(); + _packed = true; + _symmetric = true; + std::size_t ps = packed_size(_mem_rows); + _buffer = static_cast(std::calloc(ps, sizeof(double))); + if (!_buffer) { + std::stringstream ss; + ss << "Insufficient memory for a packed " << _mem_rows << " x " << _mem_rows << " matrix."; + throw NetMemoryException(ss.str()); + } + return; + } + allocate(_mem_rows, _mem_cols); +} + +// creates memory for desired "memory size", not matrix dimensions +void matrix_2d::allocate(const UINT32& rows, const UINT32& columns) { + deallocate(); buy(rows, columns, &_buffer); } @@ -429,26 +615,27 @@ void matrix_2d::buy(const UINT32& rows, const UINT32& columns, double** mem_spac __row__ = rows; __col__ = columns; - // an exception will be thrown by out_of_memory_handler - // if memory cannot be allocated + // calloc returns zero-initialised memory. For large allocations glibc + // satisfies calloc via mmap, whose anonymous pages are already zero-filled + // by the kernel — so calloc skips the redundant memset that was previously + // triggering page-fault storms (97 % of CPU time on the NSW benchmark). 
std::size_t total_size = static_cast(rows) * static_cast(columns); - (*mem_space) = new double[total_size]; + (*mem_space) = static_cast(std::calloc(total_size, sizeof(double))); if ((*mem_space) == nullptr) { std::stringstream ss; ss << "Insufficient memory for a " << rows << " x " << columns << " matrix."; throw NetMemoryException(ss.str()); } - - // Initialize memory to zero to prevent uninitialized values - std::memset((*mem_space), 0, total_size * sizeof(double)); } void matrix_2d::deallocate() { if (_buffer != nullptr) { - delete[] _buffer; + if (_owns_buffer) + std::free(_buffer); _buffer = nullptr; } + _owns_buffer = true; } matrix_2d matrix_2d::submatrix(const UINT32& row_begin, const UINT32& col_begin, const UINT32& rows, @@ -514,9 +701,14 @@ void matrix_2d::submatrix(const UINT32& row_begin, const UINT32& col_begin, matr } void matrix_2d::redim(const UINT32& rows, const UINT32& columns) { + if (_packed) { + deallocate(); + _packed = false; + } + _symmetric = false; // if new matrix size is smaller than or equal to the previous // matrix size, then simply change dimensions and return - if (rows <= _mem_rows && columns <= _mem_cols) { + if (_buffer != nullptr && rows <= _mem_rows && columns <= _mem_cols) { // Zero out the unused portions when reusing buffer // Zero partial columns (rows beyond new row count) for (UINT32 col = 0; col < columns && col < _mem_cols; ++col) { @@ -550,23 +742,46 @@ void matrix_2d::redim(const UINT32& rows, const UINT32& columns) { buy(rows, columns, &new_buffer); // Copy old data to new buffer if there was any + bool old_owns = _owns_buffer; if (old_buffer != nullptr && old_rows > 0 && old_cols > 0) { for (UINT32 col = 0; col < old_cols && col < columns; ++col) { for (UINT32 row = 0; row < old_rows && row < rows; ++row) { new_buffer[col * rows + row] = old_buffer[col * old_rows + row]; } } - // Delete old buffer - delete[] old_buffer; + if (old_owns) + std::free(old_buffer); } _buffer = new_buffer; - + _owns_buffer = true; _rows 
= _mem_rows = rows; _cols = _mem_cols = columns; } +void matrix_2d::redim_packed(const UINT32& n) { + std::size_t ps = packed_size(n); + + if (_buffer != nullptr && _packed && n <= _mem_rows) { + std::memset(_buffer, 0, ps * sizeof(double)); + _rows = _cols = _mem_rows = _mem_cols = n; + return; + } + + deallocate(); + _rows = _cols = _mem_rows = _mem_cols = n; + _packed = true; + _symmetric = true; + _matrixType = mtx_lower; + _buffer = static_cast(std::calloc(ps, sizeof(double))); + if (!_buffer) { + std::stringstream ss; + ss << "Insufficient memory for a packed " << n << " x " << n << " matrix."; + throw NetMemoryException(ss.str()); + } +} + void matrix_2d::shrink(const UINT32& rows, const UINT32& columns) { if (rows > _rows || columns > _cols) { std::stringstream ss; @@ -641,21 +856,53 @@ void matrix_2d::copybuffer(const UINT32& rowstart, const UINT32& columnstart, co } } -void matrix_2d::copyelements(const UINT32& row_dest, const UINT32& column_dest, const matrix_2d& src, - const UINT32& row_src, const UINT32& column_src, const UINT32& rows, - const UINT32& columns) { +void matrix_2d::copyelements_generic(const UINT32& row_dest, const UINT32& column_dest, const matrix_2d& src, + const UINT32& row_src, const UINT32& column_src, const UINT32& rows, + const UINT32& columns) { + assert(_buffer != nullptr && src._buffer != nullptr); + assert(row_dest + rows <= _mem_rows && "copyelements_generic(): dest row overflow"); + assert(column_dest + columns <= _mem_cols && "copyelements_generic(): dest col overflow"); + assert(row_src + rows <= src._mem_rows && "copyelements_generic(): src row overflow"); + assert(column_src + columns <= src._mem_cols && "copyelements_generic(): src col overflow"); + // Fast path: packed source → dense dest (avoids per-element get/put overhead) + if (src._packed && !_packed) { + for (UINT32 c = 0; c < columns; ++c) { + double* d = _buffer + static_cast(column_dest + c) * _mem_rows + row_dest; + UINT32 sc = column_src + c; + for (UINT32 r 
= 0; r < rows; ++r) { + UINT32 sr = row_src + r, sj = sc; + if (sr < sj) std::swap(sr, sj); + d[r] = src._buffer[packed_index(src._rows, sr, sj)]; + } + } + return; + } + // Fast path: dense source → packed dest + if (_packed && !src._packed && !src._symmetric) { + for (UINT32 c = 0; c < columns; ++c) { + const double* s = src._buffer + static_cast(column_src + c) * src._mem_rows + row_src; + UINT32 dc = column_dest + c; + for (UINT32 r = 0; r < rows; ++r) { + UINT32 dr = row_dest + r; + if (dr >= dc) + _buffer[packed_index(_rows, dr, dc)] = s[r]; + } + } + return; + } + // Fallback for remaining packed/symmetric combinations + if (src._symmetric || src._packed || _packed) { + for (UINT32 r = 0; r < rows; ++r) + for (UINT32 c = 0; c < columns; ++c) + put(row_dest + r, column_dest + c, src.get(row_src + r, column_src + c)); + return; + } UINT32 cd(0), cs(0), colend_dest(column_dest + columns); for (cd = column_dest, cs = column_src; cd < colend_dest; ++cd, ++cs) memcpy(getelementref(row_dest, cd), src.getbuffer(row_src, cs), static_cast(rows) * sizeof(double)); } -void matrix_2d::copyelements(const UINT32& row_dest, const UINT32& column_dest, const matrix_2d* src, - const UINT32& row_src, const UINT32& column_src, const UINT32& rows, - const UINT32& columns) { - copyelements(row_dest, column_dest, *src, row_src, column_src, rows, columns); -} - void matrix_2d::sweep(UINT32 k1, UINT32 k2) { double eps(1.0e-8), d; UINT32 i, j, k, it; @@ -705,16 +952,67 @@ matrix_2d matrix_2d::sweepinverse() { return *this; } -matrix_2d matrix_2d::cholesky_inverse(bool LOWER_IS_CLEARED /*=false*/) { +matrix_2d matrix_2d::cholesky_inverse(bool LOWER_IS_CLEARED /*=false*/, bool mark_symmetric /*=false*/) { if (_rows < 1) return *this; if (_rows != _cols) throw std::runtime_error("cholesky_inverse(): Matrix is not square."); + assert(_buffer != nullptr && "cholesky_inverse(): null buffer"); + assert(_mem_rows >= _rows && "cholesky_inverse(): mem_rows < rows"); + assert(_mem_cols >= _cols 
&& "cholesky_inverse(): mem_cols < cols"); + assert(!(mark_symmetric && LOWER_IS_CLEARED) && + "cholesky_inverse(): mark_symmetric with LOWER_IS_CLEARED not supported"); + + if (_packed) { + // Unpack to full format for fast blocked dpotrf/dpotri, then repack. + // dpptrf/dpptri are element-by-element and orders of magnitude slower. + lapack_int n = _rows; + std::size_t full_size = static_cast(n) * n; + + // Reuse thread-local workspace to avoid repeated allocation/deallocation. + // dpotrf/dpotri with uplo='L' only read the lower triangle, so no + // memset is needed — the unpack loop sets all lower-triangle elements. + static thread_local std::vector chol_workspace; + if (chol_workspace.size() < full_size) + chol_workspace.resize(full_size); + double* full = chol_workspace.data(); + + for (UINT32 j = 0; j < _rows; ++j) + for (UINT32 i = j; i < _rows; ++i) + full[static_cast(j) * n + i] = _buffer[packed_index(_rows, i, j)]; + + int blas_threads = blas_threads_for(n); +#if defined(USE_MKL) || defined(__MKL__) + mkl_set_num_threads(blas_threads); +#elif !defined(__APPLE__) + openblas_set_num_threads(blas_threads); +#endif + + char uplo = LOWER_TRIANGLE; + lapack_int info, lda = n; + LAPACK_FUNC(dpotrf)(&uplo, &n, full, &lda, &info); + if (info != 0) throw MatrixInversionFailure("Matrix inversion failed, the matrix is singular."); + LAPACK_FUNC(dpotri)(&uplo, &n, full, &lda, &info); + if (info != 0) throw MatrixInversionFailure("Matrix inversion failed, the matrix is singular."); + + for (UINT32 j = 0; j < _rows; ++j) + for (UINT32 i = j; i < _rows; ++i) + _buffer[packed_index(_rows, i, j)] = full[static_cast(j) * n + i]; + return *this; + } + char uplo(LOWER_TRIANGLE); if (LOWER_IS_CLEARED) uplo = UPPER_TRIANGLE; lapack_int info, n = _rows; lapack_int lda = _mem_rows; + int blas_threads = blas_threads_for(n); +#if defined(USE_MKL) || defined(__MKL__) + mkl_set_num_threads(blas_threads); +#elif !defined(__APPLE__) + openblas_set_num_threads(blas_threads); +#endif + 
// Perform Cholesky factorisation LAPACK_FUNC(dpotrf)(&uplo, &n, _buffer, &lda, &info); @@ -727,23 +1025,195 @@ matrix_2d matrix_2d::cholesky_inverse(bool LOWER_IS_CLEARED /*=false*/) { if (info != 0) throw MatrixInversionFailure("Matrix inversion failed, the matrix is singular."); - if (LOWER_IS_CLEARED) + if (mark_symmetric) { + _symmetric = true; + } else if (LOWER_IS_CLEARED) { filllower(); - else + } else { fillupper(); + } + + return *this; +} + +matrix_2d matrix_2d::cholesky_factor(bool LOWER_IS_CLEARED /*=false*/) { + if (_rows < 1) return *this; + if (_rows != _cols) throw std::runtime_error("cholesky_factor(): Matrix is not square."); + + assert(_buffer != nullptr && "cholesky_factor(): null buffer"); + assert(_mem_rows >= _rows && "cholesky_factor(): mem_rows < rows"); + assert(_mem_cols >= _cols && "cholesky_factor(): mem_cols < cols"); + + if (_packed) { + // Unpack to full format for dpotrf (same approach as cholesky_inverse). + // The factor L is stored back in packed format. 
+ lapack_int n = _rows; + std::size_t full_size = static_cast(n) * n; + + // Reuse thread-local workspace; no memset needed (dpotrf reads lower triangle only) + static thread_local std::vector chol_workspace; + if (chol_workspace.size() < full_size) + chol_workspace.resize(full_size); + double* full = chol_workspace.data(); + + for (UINT32 j = 0; j < _rows; ++j) + for (UINT32 i = j; i < _rows; ++i) + full[static_cast(j) * n + i] = _buffer[packed_index(_rows, i, j)]; + + int blas_threads = blas_threads_for(n); +#if defined(USE_MKL) || defined(__MKL__) + mkl_set_num_threads(blas_threads); +#elif !defined(__APPLE__) + openblas_set_num_threads(blas_threads); +#endif + + char uplo = LOWER_TRIANGLE; + lapack_int info, lda = n; + LAPACK_FUNC(dpotrf)(&uplo, &n, full, &lda, &info); + if (info != 0) throw MatrixInversionFailure("Cholesky factorisation failed, the matrix is singular."); + + // Store the L factor back in packed format + for (UINT32 j = 0; j < _rows; ++j) + for (UINT32 i = j; i < _rows; ++i) + _buffer[packed_index(_rows, i, j)] = full[static_cast(j) * n + i]; + return *this; + } + + char uplo(LOWER_TRIANGLE); + if (LOWER_IS_CLEARED) uplo = UPPER_TRIANGLE; + + lapack_int info, n = _rows; + lapack_int lda = _mem_rows; + + int blas_threads = blas_threads_for(n); +#if defined(USE_MKL) || defined(__MKL__) + mkl_set_num_threads(blas_threads); +#elif !defined(__APPLE__) + openblas_set_num_threads(blas_threads); +#endif + + LAPACK_FUNC(dpotrf)(&uplo, &n, _buffer, &lda, &info); + + if (info != 0) + throw MatrixInversionFailure("Cholesky factorisation failed, the matrix is singular."); return *this; } +void matrix_2d::cholesky_solve(matrix_2d& rhs, bool LOWER_IS_CLEARED /*=false*/) { + assert(_rows == _cols && "cholesky_solve(): factor matrix is not square"); + assert(_rows == rhs._rows && "cholesky_solve(): dimension mismatch"); + assert(_buffer != nullptr && "cholesky_solve(): null factor buffer"); + assert(rhs._buffer != nullptr && "cholesky_solve(): null rhs 
buffer"); + + if (_packed) { + // Unpack the L factor to full format for dpotrs. + lapack_int n = _rows; + std::size_t full_size = static_cast(n) * n; + + // Reuse thread-local workspace; no memset needed (dpotrs reads lower triangle only) + static thread_local std::vector chol_workspace; + if (chol_workspace.size() < full_size) + chol_workspace.resize(full_size); + double* full = chol_workspace.data(); + + for (UINT32 j = 0; j < _rows; ++j) + for (UINT32 i = j; i < _rows; ++i) + full[static_cast(j) * n + i] = _buffer[packed_index(_rows, i, j)]; + + char uplo = LOWER_TRIANGLE; + lapack_int nrhs = rhs._cols; + lapack_int lda = n; + lapack_int ldb = rhs._mem_rows; + lapack_int info; + + LAPACK_FUNC(dpotrs)(&uplo, &n, &nrhs, full, &lda, rhs._buffer, &ldb, &info); + + if (info != 0) + throw MatrixInversionFailure("Cholesky solve failed."); + return; + } + + char uplo(LOWER_TRIANGLE); + if (LOWER_IS_CLEARED) uplo = UPPER_TRIANGLE; + + lapack_int n = _rows; + lapack_int nrhs = rhs._cols; + lapack_int lda = _mem_rows; + lapack_int ldb = rhs._mem_rows; + lapack_int info; + + LAPACK_FUNC(dpotrs)(&uplo, &n, &nrhs, _buffer, &lda, rhs._buffer, &ldb, &info); + + if (info != 0) + throw MatrixInversionFailure("Cholesky solve failed."); +} + +double matrix_2d::dot(const matrix_2d& other) const { + assert(_cols == 1 && other._cols == 1 && "dot(): both matrices must be column vectors"); + assert(_rows == other._rows && "dot(): dimension mismatch"); + assert(_buffer != nullptr && other._buffer != nullptr && "dot(): null buffer"); + + double result = 0.0; + for (UINT32 i = 0; i < _rows; ++i) + result += get(i, 0) * other.get(i, 0); + return result; +} + matrix_2d matrix_2d::scale(const double& scalar) { + if (_packed) { + std::size_t ps = packed_size(_rows); + for (std::size_t k = 0; k < ps; ++k) + _buffer[k] *= scalar; + return *this; + } UINT32 i, j; for (i = 0; i < _rows; ++i) for (j = 0; j < _cols; ++j) *getelementref(i, j) *= scalar; return *this; } -void 
matrix_2d::blockadd(const UINT32& row_dest, const UINT32& col_dest, const matrix_2d& mat_src, - const UINT32& row_src, const UINT32& col_src, const UINT32& rows, const UINT32& cols) { +void matrix_2d::scale_symmetric_diagonal(const double* diag) { + assert(_symmetric && "scale_symmetric_diagonal(): matrix must be symmetric"); + if (_packed) { + for (UINT32 j = 0; j < _rows; ++j) + for (UINT32 i = j; i < _rows; ++i) + _buffer[packed_index(_rows, i, j)] *= diag[i] * diag[j]; + return; + } + for (UINT32 j = 0; j < _rows; ++j) + for (UINT32 i = j; i < _rows; ++i) { + double s = diag[i] * diag[j]; + *getelementref(i, j) *= s; + } +} + +void matrix_2d::blockadd_generic(const UINT32& row_dest, const UINT32& col_dest, const matrix_2d& mat_src, + const UINT32& row_src, const UINT32& col_src, const UINT32& rows, const UINT32& cols) { + // Fast path: dense source → packed dest (avoids per-element get/elementadd overhead) + if (_packed && !mat_src._packed && !mat_src._symmetric) { + for (UINT32 c = 0; c < cols; ++c) { + const double* s = mat_src._buffer + static_cast(col_src + c) * mat_src._mem_rows + row_src; + UINT32 dc = col_dest + c; + for (UINT32 r = 0; r < rows; ++r) { + UINT32 dr = row_dest + r; + if (dr >= dc) + _buffer[packed_index(_rows, dr, dc)] += s[r]; + } + } + return; + } + // Fast path: both dense (non-packed, non-symmetric) — use direct buffer arithmetic + if (!_packed && !mat_src._packed && !mat_src._symmetric) { + for (UINT32 c = 0; c < cols; ++c) { + double* d = _buffer + static_cast(col_dest + c) * _mem_rows + row_dest; + const double* s = mat_src._buffer + static_cast(col_src + c) * mat_src._mem_rows + row_src; + for (UINT32 r = 0; r < rows; ++r) + d[r] += s[r]; + } + return; + } + // Fallback: generic per-element path UINT32 i_dest, j_dest, i_src, j_src; UINT32 i_dest_end(row_dest + rows), j_dest_end(col_dest + cols); @@ -752,9 +1222,8 @@ void matrix_2d::blockadd(const UINT32& row_dest, const UINT32& col_dest, const m elementadd(i_dest, j_dest, 
mat_src.get(i_src, j_src)); } -// Same as blockadd, but adds transpose. mat_src must be square. -void matrix_2d::blockTadd(const UINT32& row_dest, const UINT32& col_dest, const matrix_2d& mat_src, - const UINT32& row_src, const UINT32& col_src, const UINT32& rows, const UINT32& cols) { +void matrix_2d::blockTadd_generic(const UINT32& row_dest, const UINT32& col_dest, const matrix_2d& mat_src, + const UINT32& row_src, const UINT32& col_src, const UINT32& rows, const UINT32& cols) { UINT32 i_dest, j_dest, i_src, j_src; UINT32 i_dest_end(row_dest + rows), j_dest_end(col_dest + cols); @@ -773,79 +1242,153 @@ void matrix_2d::blocksubtract(const UINT32& row_dest, const UINT32& col_dest, co elementsubtract(i_dest, j_dest, mat_src.get(i_src, j_src)); } -// clearlower() void matrix_2d::clearlower() { + assert(_buffer != nullptr); + assert(!_packed && "clearlower(): not valid for packed storage"); // Sets lower triangle elements to zero UINT32 col, row; - for (row = 1, col = 0; col < _mem_cols; ++col, ++row) + for (row = 1, col = 0; col < _mem_cols && row < _mem_rows; ++col, ++row) memset(getelementref(row, col), 0, (static_cast(_mem_rows) - row) * sizeof(double)); } -// clearupper() void matrix_2d::clearupper() { + assert(!_packed && "clearupper(): not valid for packed storage"); // Sets upper triangle elements to zero - UINT32 col, row; - for (row = 0; row < _rows; ++row) - for (col = row + 1; col < _cols; ++col) put(row, col, 0.0); + // Column-major: for each column, zero the rows above the diagonal + for (UINT32 col = 1; col < _cols; ++col) + memset(_buffer + col * _mem_rows, 0, col * sizeof(double)); } -// filllower() void matrix_2d::filllower() { - // copies upper triangle to lower triangle - UINT32 column, row; - for (row = 1; row < _rows; row++) - for (column = 0; column < row; column++) put(row, column, get(column, row)); + assert(_buffer != nullptr); + assert(!_packed && "filllower(): not valid for packed storage"); + assert(_rows == _cols && "filllower(): matrix 
must be square"); + // Copy upper triangle to lower: A(row,col) = A(col,row) for row > col + // Column-major: A(r,c) = _buffer[c * _mem_rows + r] + // Outer loop over destination columns so writes are sequential in memory. + const std::size_t mr = _mem_rows; + constexpr UINT32 BLK = 64; + for (UINT32 cb = 0; cb < _cols; cb += BLK) { + UINT32 ce = std::min(cb + BLK, _cols); + for (UINT32 rb = ce; rb < _rows; rb += BLK) { + UINT32 re = std::min(rb + BLK, _rows); + for (UINT32 col = cb; col < ce; col++) { + double* dst = _buffer + col * mr; + for (UINT32 row = rb; row < re; row++) + dst[row] = _buffer[row * mr + col]; + } + } + } + // Diagonal blocks: row > col within the same block + for (UINT32 cb = 0; cb < _cols; cb += BLK) { + UINT32 ce = std::min(cb + BLK, _cols); + for (UINT32 col = cb; col < ce; col++) { + double* dst = _buffer + col * mr; + for (UINT32 row = col + 1; row < ce; row++) + dst[row] = _buffer[row * mr + col]; + } + } } -// fillupper() void matrix_2d::fillupper() { - // copies lower triangle to upper triangle - UINT32 column, row; - for (row = 1; row < _rows; row++) - for (column = 0; column < row; column++) put(column, row, get(row, column)); + assert(_buffer != nullptr); + assert(!_packed && "fillupper(): not valid for packed storage"); + assert(_rows == _cols && "fillupper(): matrix must be square"); + // Copy lower triangle to upper: A(r,c) = A(c,r) for r < c + // Column-major: A(r,c) = _buffer[c * _mem_rows + r] + // + // Outer loop over destination columns so writes are sequential in memory. + // Reads are strided (one element per source column) but write-combining + // and store buffers make sequential writes much faster than sequential reads + // for large matrices. 
+ const std::size_t mr = _mem_rows; + constexpr UINT32 BLK = 64; + for (UINT32 cb = 0; cb < _cols; cb += BLK) { + UINT32 ce = std::min(cb + BLK, _cols); + for (UINT32 rb = 0; rb < ce; rb += BLK) { + UINT32 re = std::min(rb + BLK, ce); + for (UINT32 col = cb; col < ce; col++) { + double* dst = _buffer + col * mr; + UINT32 r0 = rb; + UINT32 r1 = std::min(re, col); + for (UINT32 row = r0; row < r1; row++) + dst[row] = _buffer[row * mr + col]; + } + } + } } -// zero() -void matrix_2d::zero() { memset(_buffer, 0, buffersize()); } +void matrix_2d::zero() { + assert(_buffer != nullptr && "zero(): null buffer"); + if (_packed) + memset(_buffer, 0, packed_size(_mem_rows) * sizeof(double)); + else + memset(_buffer, 0, buffersize()); +} // zero() void matrix_2d::zero(const UINT32& row_begin, const UINT32& col_begin, const UINT32& rows, const UINT32& columns) { + assert(_buffer != nullptr && "zero(sub): null buffer"); + assert(row_begin + rows <= _mem_rows && "zero(sub): row overflow"); + assert(col_begin + columns <= _mem_cols && "zero(sub): col overflow"); UINT32 col(0), col_end(col_begin + columns); for (col = col_begin; col < col_end; ++col) memset(getelementref(row_begin, col), 0, rows * sizeof(double)); } matrix_2d& matrix_2d::operator=(const matrix_2d& rhs) { - // Overloaded assignment operator if (this == &rhs) return *this; - // If rhs data can fit within limits of this matrix, copy - // and return. 
Otherwise, allocate new memory + if (rhs._packed) { + std::size_t ps = packed_size(rhs._rows); + if (_packed && _mem_rows >= rhs._rows) { + _rows = _cols = rhs._rows; + memcpy(_buffer, rhs._buffer, ps * sizeof(double)); + } else { + deallocate(); + _rows = _cols = _mem_rows = _mem_cols = rhs._rows; + _buffer = static_cast(std::malloc(ps * sizeof(double))); + if (!_buffer) throw NetMemoryException("Insufficient memory for packed matrix assignment."); + memcpy(_buffer, rhs._buffer, ps * sizeof(double)); + } + _packed = true; + _symmetric = true; + _matrixType = rhs._matrixType; + _maxvalCol = rhs.maxvalueCol(); + _maxvalRow = rhs.maxvalueRow(); + return *this; + } + + // rhs is not packed + if (_packed) { + deallocate(); + _packed = false; + _mem_rows = _mem_cols = 0; + } + if (_mem_rows >= rhs.rows() && _mem_cols >= rhs.columns()) { - // don't change _mem_rows or _mem_cols. Simply update - // visible dimensions and copy buffer _rows = rhs.rows(); _cols = rhs.columns(); copybuffer(_rows, _cols, rhs); - _maxvalCol = rhs.maxvalueCol(); // col of max value - _maxvalRow = rhs.maxvalueRow(); // row of max value + _maxvalCol = rhs.maxvalueCol(); + _maxvalRow = rhs.maxvalueRow(); + _symmetric = rhs._symmetric; return *this; } - // Okay, rhs is larger, so allocate new memory. Call free - // memory first before changing row and column dimensions! 
deallocate(); - _mem_rows = rhs.memRows(); // change memory limits + _mem_rows = rhs.memRows(); _mem_cols = rhs.memColumns(); - _rows = rhs.rows(); // change matrix dimensions + _rows = rhs.rows(); _cols = rhs.columns(); allocate(_mem_rows, _mem_cols); copybuffer(_rows, _cols, rhs); - _maxvalCol = rhs.maxvalueCol(); // col of max value - _maxvalRow = rhs.maxvalueRow(); // row of max value + _maxvalCol = rhs.maxvalueCol(); + _maxvalRow = rhs.maxvalueRow(); + _symmetric = rhs._symmetric; return *this; } @@ -875,6 +1418,10 @@ matrix_2d matrix_2d::add(const matrix_2d& rhs) { // multiplies this matrix by rhs and stores the result in a new matrix // Uses Intel MKL dgemm matrix_2d matrix_2d::multiply(const char* lhs_trans, const matrix_2d& rhs, const char* rhs_trans) { + assert(!_symmetric && "multiply(dgemm): LHS is symmetric — use multiply_sym instead"); + assert(!rhs._symmetric && "multiply(dgemm): RHS is symmetric — upper triangle is unpopulated"); + assert(_buffer != nullptr && rhs._buffer != nullptr); + matrix_2d m(_rows, rhs.columns()); const double one = 1.0; @@ -914,6 +1461,10 @@ matrix_2d matrix_2d::multiply(const char* lhs_trans, const matrix_2d& rhs, const // Uses Intel MKL dgemm matrix_2d matrix_2d::multiply(const matrix_2d& lhs, const char* lhs_trans, const matrix_2d& rhs, const char* rhs_trans) { + assert(!lhs._symmetric && "multiply(dgemm): LHS is symmetric — use multiply_sym instead"); + assert(!rhs._symmetric && "multiply(dgemm): RHS is symmetric — upper triangle is unpopulated"); + assert(lhs._buffer != nullptr && rhs._buffer != nullptr && _buffer != nullptr); + const double one = 1.0; const double zero = 0.0; @@ -947,6 +1498,48 @@ matrix_2d::multiply(const matrix_2d& lhs, const char* lhs_trans, const matrix_2d return *this; } // Multiply() +// C (this) = sym_lhs * rhs, using dsymm or dspmv (packed) +matrix_2d matrix_2d::multiply_sym(const matrix_2d& sym_lhs, const matrix_2d& rhs) { + lapack_int m = sym_lhs.rows(); + lapack_int n = rhs.columns(); + + 
assert(sym_lhs._symmetric && "multiply_sym(): LHS must be marked symmetric"); + assert(sym_lhs._buffer != nullptr && "multiply_sym(): LHS null buffer"); + assert(rhs._buffer != nullptr && "multiply_sym(): RHS null buffer"); + assert(_buffer != nullptr && "multiply_sym(): result null buffer"); + assert(_buffer != sym_lhs._buffer && _buffer != rhs._buffer && + "multiply_sym(): result must not alias inputs"); + + if (sym_lhs.columns() != sym_lhs.rows()) + throw std::runtime_error("multiply_sym(): LHS matrix is not square."); + if (static_cast(rhs.rows()) != m) + throw std::runtime_error("multiply_sym(): Matrix dimensions are incompatible."); + if (_rows != static_cast(m) || _cols != static_cast(n)) + throw std::runtime_error("multiply_sym(): Result matrix dimensions are incompatible."); + + if (sym_lhs._packed) { + for (lapack_int j = 0; j < n; ++j) { + BLAS_FUNC(dspmv)(CblasColMajor, CblasLower, + m, 1.0, sym_lhs._buffer, + rhs._buffer + j * rhs._mem_rows, 1, + 0.0, _buffer + j * _mem_rows, 1); + } + return *this; + } + + assert(sym_lhs.memRows() >= sym_lhs.rows() && "multiply_sym(): LHS LDA < M"); + assert(rhs.memRows() >= rhs.rows() && "multiply_sym(): RHS LDA < M"); + assert(_mem_rows >= _rows && "multiply_sym(): result LDA < M"); + + BLAS_FUNC(dsymm)(CblasColMajor, CblasLeft, CblasLower, + m, n, 1.0, + sym_lhs.getbuffer(), sym_lhs.memRows(), + rhs.getbuffer(), rhs.memRows(), + 0.0, _buffer, _mem_rows); + + return *this; +} + // Transpose() matrix_2d matrix_2d::transpose(const matrix_2d& matA) { if ((matA.columns() != _rows) || (matA.rows() != _cols)) @@ -967,9 +1560,19 @@ matrix_2d matrix_2d::transpose() { return m; } // Transpose() -// computes and retains the maximum value in the matrix double matrix_2d::compute_maximum_value() { _maxvalCol = _maxvalRow = 0; + if (_packed) { + for (UINT32 j = 0; j < _rows; ++j) { + for (UINT32 i = j; i < _rows; ++i) { + if (fabs(get(i, j)) > fabs(get(_maxvalRow, _maxvalCol))) { + _maxvalRow = i; + _maxvalCol = j; + } + } + } + 
return get(_maxvalRow, _maxvalCol); + } UINT32 col, row; for (row = 0; row < _rows; ++row) { for (col = 0; col < _cols; col++) { diff --git a/dynadjust/include/math/dnamatrix_contiguous.hpp b/dynadjust/include/math/dnamatrix_contiguous.hpp index be4d79c85..127e32045 100644 --- a/dynadjust/include/math/dnamatrix_contiguous.hpp +++ b/dynadjust/include/math/dnamatrix_contiguous.hpp @@ -23,6 +23,7 @@ #define DNAMATRIX_CONTIGUOUS_H_ /// \cond +#include #include /// \endcond @@ -165,14 +166,26 @@ static_assert(sizeof(lapack_int) == 4, "LP64 interface requires 32-bit integers" extern "C" { void LAPACK_FUNC(dpotrf)(const char* uplo, const lapack_int* n, double* a, const lapack_int* lda, lapack_int* info); void LAPACK_FUNC(dpotri)(const char* uplo, const lapack_int* n, double* a, const lapack_int* lda, lapack_int* info); +void LAPACK_FUNC(dpptrf)(const char* uplo, const lapack_int* n, double* ap, lapack_int* info); +void LAPACK_FUNC(dpptri)(const char* uplo, const lapack_int* n, double* ap, lapack_int* info); void LAPACK_FUNC(dsytrf)(const char* uplo, const lapack_int* n, double* a, const lapack_int* lda, lapack_int* ipiv, double* work, const lapack_int* lwork, lapack_int* info); void LAPACK_FUNC(dsytri)(const char* uplo, const lapack_int* n, double* a, const lapack_int* lda, const lapack_int* ipiv, double* work, lapack_int* info); +void LAPACK_FUNC(dpotrs)(const char* uplo, const lapack_int* n, const lapack_int* nrhs, + const double* a, const lapack_int* lda, double* b, const lapack_int* ldb, lapack_int* info); void BLAS_FUNC(dgemm)(const enum CBLAS_ORDER ORDER, const enum CBLAS_TRANSPOSE TRANSA, const enum CBLAS_TRANSPOSE TRANSB, const lapack_int M, const lapack_int N, const lapack_int K, const double ALPHA, const double* A, const lapack_int LDA, const double* B, const lapack_int LDB, const double BETA, double* C, const lapack_int LDC); +void BLAS_FUNC(dsymm)(const enum CBLAS_ORDER ORDER, const enum CBLAS_SIDE SIDE, const enum CBLAS_UPLO UPLO, + const lapack_int M, const 
lapack_int N, + const double ALPHA, const double* A, const lapack_int LDA, const double* B, const lapack_int LDB, + const double BETA, double* C, const lapack_int LDC); +void BLAS_FUNC(dspmv)(const enum CBLAS_ORDER ORDER, const enum CBLAS_UPLO UPLO, + const lapack_int N, const double ALPHA, const double* AP, + const double* X, const lapack_int INCX, + const double BETA, double* Y, const lapack_int INCY); } #endif @@ -187,6 +200,10 @@ class MatrixInversionFailure : public std::runtime_error { using std::runtime_error::runtime_error; }; +// Set/get the maximum BLAS thread count used by matrix operations. +void set_max_blas_threads(int n); +int get_max_blas_threads(); + class matrix_2d; typedef std::vector v_mat_2d, *pv_mat_2d; typedef v_mat_2d::iterator _it_v_mat_2d; @@ -220,9 +237,23 @@ class matrix_2d : public new_handler_support { // element retrieval // see DNAMATRIX_ROW_WISE inline double& get(const UINT32& row, const UINT32& column) const { + assert(_buffer != nullptr); + assert(row < _mem_rows && "get(): row out of bounds"); + assert(column < _mem_cols && "get(): column out of bounds"); + if (_packed) { + UINT32 i = row, j = column; + if (i < j) { i = column; j = row; } + return _buffer[packed_index(_rows, i, j)]; + } + if (_symmetric && row < column) + return DNAMATRIX_ELEMENT(_buffer, _mem_rows, _mem_cols, column, row); return DNAMATRIX_ELEMENT(_buffer, _mem_rows, _mem_cols, row, column); } inline double* getbuffer(const UINT32& row, const UINT32& column) const { + assert(_buffer != nullptr); + assert(!_packed && "getbuffer(row,col): not valid for packed storage"); + assert(row < _mem_rows && "getbuffer(): row out of bounds"); + assert(column < _mem_cols && "getbuffer(): column out of bounds"); return _buffer + DNAMATRIX_INDEX(_mem_rows, _mem_cols, row, column); } @@ -236,9 +267,29 @@ class matrix_2d : public new_handler_support { inline UINT32 maxvalueCol() const { return _maxvalCol; } inline double* getelementref(const UINT32& row, const UINT32& column) 
const { + assert(_buffer != nullptr); + assert(row < _mem_rows && "getelementref(): row out of bounds"); + assert(column < _mem_cols && "getelementref(): column out of bounds"); + if (_packed) { + UINT32 i = row, j = column; + if (i < j) { i = column; j = row; } + return &_buffer[packed_index(_rows, i, j)]; + } + if (_symmetric && row < column) + return &(DNAMATRIX_ELEMENT(_buffer, _mem_rows, _mem_cols, column, row)); return &(DNAMATRIX_ELEMENT(_buffer, _mem_rows, _mem_cols, row, column)); } inline double* getelementref(const UINT32& row, const UINT32& column) { + assert(_buffer != nullptr); + assert(row < _mem_rows && "getelementref(): row out of bounds"); + assert(column < _mem_cols && "getelementref(): column out of bounds"); + if (_packed) { + UINT32 i = row, j = column; + if (i < j) { i = column; j = row; } + return &_buffer[packed_index(_rows, i, j)]; + } + if (_symmetric && row < column) + return &(DNAMATRIX_ELEMENT(_buffer, _mem_rows, _mem_cols, column, row)); return &(DNAMATRIX_ELEMENT(_buffer, _mem_rows, _mem_cols, row, column)); } @@ -250,34 +301,124 @@ class matrix_2d : public new_handler_support { inline void maxvalueCol(const UINT32& c) { _maxvalCol = c; } inline void put(const UINT32& row, const UINT32& column, const double& value) { + assert(_buffer != nullptr); + assert(row < _mem_rows && "put(): row out of bounds"); + assert(column < _mem_cols && "put(): column out of bounds"); + if (_packed) { + UINT32 i = row, j = column; + if (i < j) { i = column; j = row; } + _buffer[packed_index(_rows, i, j)] = value; + return; + } DNAMATRIX_ELEMENT(_buffer, _mem_rows, _mem_cols, row, column) = value; } inline UINT32 matrixType() const { return _matrixType; } inline void matrixType(const UINT32 t) { _matrixType = t; } + inline bool is_symmetric() const { return _symmetric; } + inline void set_symmetric(bool s) { + assert((!s || _rows == _cols) && "set_symmetric(true): matrix must be square"); + _symmetric = s; + } + + inline bool is_packed() const { return 
_packed; } + + static inline std::size_t packed_index(UINT32 n, UINT32 i, UINT32 j) { + return static_cast(j) * n - static_cast(j) * (j - 1) / 2 + (i - j); + } + + static inline std::size_t packed_size(UINT32 n) { + return static_cast(n) * (n + 1) / 2; + } + // Matrix functions - void copyelements(const UINT32& row_dest, const UINT32& column_dest, const matrix_2d& src, const UINT32& row_src, - const UINT32& column_src, const UINT32& rows, const UINT32& columns); - void copyelements(const UINT32& row_dest, const UINT32& column_dest, const matrix_2d* src, const UINT32& row_src, - const UINT32& column_src, const UINT32& rows, const UINT32& columns); + inline void copyelements(const UINT32& row_dest, const UINT32& column_dest, const matrix_2d& src, + const UINT32& row_src, const UINT32& column_src, + const UINT32& rows, const UINT32& columns) { + assert(_buffer != nullptr && src._buffer != nullptr); + assert(row_dest + rows <= _mem_rows && "copyelements(): dest row overflow"); + assert(column_dest + columns <= _mem_cols && "copyelements(): dest col overflow"); + assert(row_src + rows <= src._mem_rows && "copyelements(): src row overflow"); + assert(column_src + columns <= src._mem_cols && "copyelements(): src col overflow"); + if (rows == 3 && columns == 3 && !src._symmetric && !src._packed && !_packed) { + for (UINT32 c = 0; c < 3; ++c) { + double* dst = _buffer + static_cast(column_dest + c) * _mem_rows + row_dest; + const double* s = src._buffer + static_cast(column_src + c) * src._mem_rows + row_src; + dst[0] = s[0]; dst[1] = s[1]; dst[2] = s[2]; + } + return; + } + copyelements_generic(row_dest, column_dest, src, row_src, column_src, rows, columns); + } + void copyelements_generic(const UINT32& row_dest, const UINT32& column_dest, const matrix_2d& src, + const UINT32& row_src, const UINT32& column_src, + const UINT32& rows, const UINT32& columns); + inline void copyelements(const UINT32& row_dest, const UINT32& column_dest, const matrix_2d* src, + const UINT32& 
row_src, const UINT32& column_src, + const UINT32& rows, const UINT32& columns) { + copyelements(row_dest, column_dest, *src, row_src, column_src, rows, columns); + } inline void elementadd(const UINT32& row, const UINT32& column, const double& increment) { + assert(row < _rows && column < _cols && "elementadd(): out of bounds"); + if (_packed && row < column) return; *getelementref(row, column) += increment; } inline void elementsubtract(const UINT32& row, const UINT32& column, const double& decrement) { + assert(row < _rows && column < _cols && "elementsubtract(): out of bounds"); + if (_packed && row < column) return; *getelementref(row, column) -= decrement; } inline void elementmultiply(const UINT32& row, const UINT32& column, const double& scale) { + assert(row < _rows && column < _cols && "elementmultiply(): out of bounds"); *getelementref(row, column) *= scale; } - void blockadd(const UINT32& row_dest, const UINT32& col_dest, const matrix_2d& mat_src, const UINT32& row_src, - const UINT32& col_src, const UINT32& rows, const UINT32& cols); - void blockTadd(const UINT32& row_dest, const UINT32& col_dest, const matrix_2d& mat_src, const UINT32& row_src, - const UINT32& col_src, const UINT32& rows, const UINT32& cols); + inline void blockadd(const UINT32& row_dest, const UINT32& col_dest, const matrix_2d& mat_src, + const UINT32& row_src, const UINT32& col_src, const UINT32& rows, const UINT32& cols) { + assert(_buffer != nullptr && mat_src._buffer != nullptr); + assert(row_dest + rows <= _mem_rows && "blockadd(): dest row overflow"); + assert(col_dest + cols <= _mem_cols && "blockadd(): dest col overflow"); + assert(row_src + rows <= mat_src._mem_rows && "blockadd(): src row overflow"); + assert(col_src + cols <= mat_src._mem_cols && "blockadd(): src col overflow"); + if (rows == 3 && cols == 3 && !mat_src._symmetric && !mat_src._packed && !_packed) { + for (UINT32 c = 0; c < 3; ++c) { + double* dst = _buffer + static_cast(col_dest + c) * _mem_rows + row_dest; 
+ const double* src = mat_src._buffer + static_cast(col_src + c) * mat_src._mem_rows + row_src; + dst[0] += src[0]; dst[1] += src[1]; dst[2] += src[2]; + } + return; + } + blockadd_generic(row_dest, col_dest, mat_src, row_src, col_src, rows, cols); + } + void blockadd_generic(const UINT32& row_dest, const UINT32& col_dest, const matrix_2d& mat_src, + const UINT32& row_src, const UINT32& col_src, const UINT32& rows, const UINT32& cols); + + inline void blockTadd(const UINT32& row_dest, const UINT32& col_dest, const matrix_2d& mat_src, + const UINT32& row_src, const UINT32& col_src, const UINT32& rows, const UINT32& cols) { + assert(_buffer != nullptr && mat_src._buffer != nullptr); + assert(row_dest + rows <= _mem_rows && "blockTadd(): dest row overflow"); + assert(col_dest + cols <= _mem_cols && "blockTadd(): dest col overflow"); + assert(row_src + cols <= mat_src._mem_rows && "blockTadd(): src row overflow (transposed)"); + assert(col_src + rows <= mat_src._mem_cols && "blockTadd(): src col overflow (transposed)"); + if (rows == 3 && cols == 3 && !_packed) { + for (UINT32 c = 0; c < 3; ++c) { + double* dst = _buffer + static_cast(col_dest + c) * _mem_rows + row_dest; + const std::size_t smr = mat_src._mem_rows; + const double* src_r0 = mat_src._buffer + static_cast(row_src) * smr + col_src + c; + dst[0] += src_r0[0]; + dst[1] += src_r0[smr]; + dst[2] += src_r0[2 * smr]; + } + return; + } + blockTadd_generic(row_dest, col_dest, mat_src, row_src, col_src, rows, cols); + } + void blockTadd_generic(const UINT32& row_dest, const UINT32& col_dest, const matrix_2d& mat_src, + const UINT32& row_src, const UINT32& col_src, const UINT32& rows, const UINT32& cols); void blocksubtract(const UINT32& row_dest, const UINT32& col_dest, const matrix_2d& mat_src, const UINT32& row_src, const UINT32& col_src, const UINT32& rows, const UINT32& cols); @@ -287,13 +428,19 @@ class matrix_2d : public new_handler_support { matrix_2d multiply(const char* lhs_trans, const matrix_2d& rhs, 
const char* rhs_trans); // multiplication matrix_2d multiply(const matrix_2d& lhs, const char* lhs_trans, const matrix_2d& rhs, const char* rhs_trans); // multiplication + // C = sym_lhs * rhs using dsymm (sym_lhs must be symmetric, lower triangle populated) + matrix_2d multiply_sym(const matrix_2d& sym_lhs, const matrix_2d& rhs); matrix_2d sweepinverse(); // Sweep inverse (good for rotation matrices) - matrix_2d cholesky_inverse(bool LOWER_IS_CLEARED = false); // Cholesky inverse + matrix_2d cholesky_inverse(bool LOWER_IS_CLEARED = false, bool mark_symmetric = false); // Cholesky inverse + matrix_2d cholesky_factor(bool LOWER_IS_CLEARED = false); // Cholesky factor only (dpotrf), no inverse + void cholesky_solve(matrix_2d& rhs, bool LOWER_IS_CLEARED = false); // Solve from pre-factored matrix (dpotrs) + double dot(const matrix_2d& other) const; // Inner product of two column vectors matrix_2d transpose(const matrix_2d&); // Transpose matrix_2d transpose(); // '' matrix_2d scale(const double& scalar); // scale + void scale_symmetric_diagonal(const double* diag); // overloaded operators // equality @@ -347,6 +494,7 @@ class matrix_2d : public new_handler_support<matrix_2d> { void setsize(const UINT32& rows, const UINT32& columns); // sets matrix size to rows * columns only (buffer not allocated any memory) void redim(const UINT32& rows, const UINT32& columns); // redimensions matrix to rows * columns + void redim_packed(const UINT32& n); // redimensions to packed symmetric n×n void replace(const UINT32& rowstart, const UINT32& columnstart, const matrix_2d& newmat); void replace(const UINT32& rowstart, const UINT32& columnstart, const UINT32& rows, const UINT32& columns, const matrix_2d& newmat); @@ -369,6 +517,14 @@ class matrix_2d : public new_handler_support<matrix_2d> { // Writing to memory mapped file void WriteMappedFileRegion(void* addr); + // In-place mmap: attach _buffer directly to mmap data region (no memcpy) + void AttachMappedFileRegion(void* addr); + + // In-place mmap:
write footer back, detach _buffer + void DetachMappedFileRegion(void* addr); + + inline bool owns_buffer() const { return _owns_buffer; } + + // debug #ifdef _MSDEBUG void trace(const std::string& comment, const std::string& format) const; @@ -376,12 +532,14 @@ const UINT32& row_begin, const UINT32& col_begin, const UINT32& rows, const UINT32& columns) const; #endif + void deallocate(); + private: inline std::size_t buffersize() const { + if (_packed) + return packed_size(_mem_rows) * sizeof(double); return static_cast<std::size_t>(_mem_rows) * static_cast<std::size_t>(_mem_cols) * sizeof(double); } - - void deallocate(); void buy(const UINT32& rows, const UINT32& columns, double** mem_space); void copybuffer(const UINT32& rows, const UINT32& columns, const matrix_2d& oldmat); void copybuffer(const UINT32& rowstart, const UINT32& columnstart, const UINT32& rows, const UINT32& columns, @@ -395,11 +553,14 @@ class matrix_2d : public new_handler_support<matrix_2d> { UINT32 _cols; // number of actual cols UINT32 _rows; // number of actual rows double* _buffer; // matrix buffer elements + bool _owns_buffer; // true if _buffer is heap-allocated (false when attached to mmap) UINT32 _maxvalCol; // col of max value UINT32 _maxvalRow; // row of max value UINT32 _matrixType; // full, upper/lower, sparse + bool _symmetric; // only lower triangle populated (upper is mirror) + bool _packed; // packed column-major lower-triangle storage (n*(n+1)/2 elements) }; } // namespace math diff --git a/dynadjust/include/memory/dnafile_mapping.cpp b/dynadjust/include/memory/dnafile_mapping.cpp index 1a301b315..675e3c600 100644 --- a/dynadjust/include/memory/dnafile_mapping.cpp +++ b/dynadjust/include/memory/dnafile_mapping.cpp @@ -22,7 +22,13 @@ #include <include/memory/dnafile_mapping.hpp> -namespace dynadjust { +#ifdef __linux__ +#include <sys/mman.h> +#include <unistd.h> +#include <cstdint> +#endif + +namespace dynadjust { namespace memory { //block_map_t::block_map_t() @@ -72,15 +78,33 @@ void
block_map_t::MapRegion(FileMapPtr file_map_ptr) { region_ptr_.reset( new boost::interprocess::mapped_region( - *file_map_ptr, - boost::interprocess::read_write, - region_offset_, + *file_map_ptr, + boost::interprocess::read_write, + region_offset_, data_size_ ) ); } +void block_map_t::AdviseSequential() { + if (region_ptr_) + region_ptr_->advise(boost::interprocess::mapped_region::advice_sequential); +} + + +void block_map_t::AdviseDontNeed() { + if (region_ptr_) + region_ptr_->advise(boost::interprocess::mapped_region::advice_dontneed); +} + + +void block_map_t::AdviseWillNeed() { + if (region_ptr_) + region_ptr_->advise(boost::interprocess::mapped_region::advice_willneed); +} + + // class to hold addresses and sizes for all matrices // in a vector of segmented blocks vmat_file_map::vmat_file_map() @@ -129,10 +153,54 @@ void vmat_file_map::CreateFileMap() } -void vmat_file_map::MapRegion(const UINT32 block) +void vmat_file_map::MapRegion(const UINT32 block) { vblockMapRegions_.at(block).MapRegion(file_map_ptr_); } + + +void vmat_file_map::AdviseRegion(const UINT32 block, boost::interprocess::mapped_region::advice_types advice) +{ + auto& region = vblockMapRegions_.at(block); +#ifdef __linux__ + // Boost's mapped_region::advise calls madvise() without + // page-aligning the address. Stage regions are packed + // end-to-end so only the first region's address is page- + // aligned by chance; madvise on the rest fails silently + // with EINVAL. Round start up and end down to whole + // pages, then call madvise directly. For DONTNEED also + // flush dirty pages asynchronously so the kernel can + // evict them promptly. 
+ if (region.region_ptr_ && region.data_size_ > 0) { + const std::size_t page_size = + static_cast<std::size_t>(sysconf(_SC_PAGESIZE)); + auto base = reinterpret_cast<std::uintptr_t>( + region.region_ptr_->get_address()); + std::uintptr_t aligned_start = + (base + page_size - 1) & ~(page_size - 1); + std::uintptr_t aligned_end = + (base + region.data_size_) & ~(page_size - 1); + if (aligned_end > aligned_start) { + void* addr = reinterpret_cast<void*>(aligned_start); + std::size_t len = aligned_end - aligned_start; + int madv = MADV_NORMAL; + switch (advice) { + case boost::interprocess::mapped_region::advice_sequential: + madv = MADV_SEQUENTIAL; break; + case boost::interprocess::mapped_region::advice_willneed: + madv = MADV_WILLNEED; break; + case boost::interprocess::mapped_region::advice_dontneed: + msync(addr, len, MS_ASYNC); + madv = MADV_DONTNEED; break; + default: break; + } + madvise(addr, len, madv); + return; + } + } +#endif + region.region_ptr_->advise(advice); +} } // namespace memory diff --git a/dynadjust/include/memory/dnafile_mapping.hpp b/dynadjust/include/memory/dnafile_mapping.hpp index ff33a8eb2..abd08859e 100644 --- a/dynadjust/include/memory/dnafile_mapping.hpp +++ b/dynadjust/include/memory/dnafile_mapping.hpp @@ -64,12 +64,17 @@ class block_map_t inline size_t GetDataSize() const { return data_size_; } inline size_t GetRegionOffset() const { return region_offset_; } inline size_t GetCumulativeRegionOffset() const { return region_offset_ + data_size_; } - + inline void SetDataSize(const size_t& size) { data_size_ = size; } inline void SetRegionOffset(const size_t& size) { region_offset_ = size; } - + void MapRegion(FileMapPtr file_map_ptr); + // Memory advisory hints for mapped regions + void AdviseSequential(); + void AdviseDontNeed(); + void AdviseWillNeed(); + size_t data_size_; // Size of this matrix.
size_t region_offset_; // Offset from the beginning of the region MapRegPtr region_ptr_; // shared pointer to the region @@ -92,10 +97,11 @@ class vmat_file_map { void setnewFilePath(const std::string& filePath, bool remove_mapped_file); void CreateFileMap(); void MapRegion(const UINT32 block); + void AdviseRegion(const UINT32 block, boost::interprocess::mapped_region::advice_types advice); inline FileMapPtr getFileMapPtr() const { return file_map_ptr_; } - inline void* GetBlockRegionAddr(const UINT32 block) const { - return vblockMapRegions_.at(block).region_ptr_->get_address(); + inline void* GetBlockRegionAddr(const UINT32 block) const { + return vblockMapRegions_.at(block).region_ptr_->get_address(); } vmat_file_map(const vmat_file_map&); // prevent copying diff --git a/tests/test_matrix.cpp b/tests/test_matrix.cpp index c96d32dfa..ae7169cfd 100644 --- a/tests/test_matrix.cpp +++ b/tests/test_matrix.cpp @@ -22,6 +22,7 @@ #define TESTING_MAIN #include +#include #include #include "math/dnamatrix_contiguous.hpp" @@ -587,3 +588,947 @@ TEST_CASE("Cholesky inverse with LOWER_IS_CLEARED", "[matrix_2d]") { REQUIRE(abs(inverse.get(0, 2) - inverse.get(2, 0)) < 1e-10); REQUIRE(abs(inverse.get(1, 2) - inverse.get(2, 1)) < 1e-10); } + +// ============================================================================ +// Symmetric matrix support +// ============================================================================ + +TEST_CASE("Symmetric flag default is false", "[matrix_2d][symmetric]") { + matrix_2d mat(3, 3); + REQUIRE(mat.is_symmetric() == false); +} + +TEST_CASE("Symmetric flag set and get", "[matrix_2d][symmetric]") { + matrix_2d mat(3, 3); + mat.set_symmetric(true); + REQUIRE(mat.is_symmetric() == true); + mat.set_symmetric(false); + REQUIRE(mat.is_symmetric() == false); +} + +TEST_CASE("Symmetric flag propagates through copy constructor", "[matrix_2d][symmetric]") { + matrix_2d mat(3, 3); + mat.put(0, 0, 1.0); mat.put(1, 0, 2.0); mat.put(1, 1, 3.0); + 
mat.put(2, 0, 4.0); mat.put(2, 1, 5.0); mat.put(2, 2, 6.0); + mat.set_symmetric(true); + + matrix_2d copy(mat); + REQUIRE(copy.is_symmetric() == true); + REQUIRE(copy.get(0, 1) == 2.0); // reflects from lower +} + +TEST_CASE("Symmetric flag propagates through operator=", "[matrix_2d][symmetric]") { + matrix_2d mat(3, 3); + mat.put(0, 0, 1.0); mat.put(1, 0, 2.0); mat.put(1, 1, 3.0); + mat.set_symmetric(true); + + matrix_2d other(3, 3); + other = mat; + REQUIRE(other.is_symmetric() == true); + REQUIRE(other.get(0, 1) == 2.0); +} + +TEST_CASE("Symmetric get reflects upper to lower", "[matrix_2d][symmetric]") { + // Populate only the lower triangle + diagonal + matrix_2d mat(4, 4); + mat.put(0, 0, 1.0); + mat.put(1, 0, 2.0); mat.put(1, 1, 3.0); + mat.put(2, 0, 4.0); mat.put(2, 1, 5.0); mat.put(2, 2, 6.0); + mat.put(3, 0, 7.0); mat.put(3, 1, 8.0); mat.put(3, 2, 9.0); mat.put(3, 3, 10.0); + mat.set_symmetric(true); + + // Diagonal + REQUIRE(mat.get(0, 0) == 1.0); + REQUIRE(mat.get(1, 1) == 3.0); + REQUIRE(mat.get(2, 2) == 6.0); + REQUIRE(mat.get(3, 3) == 10.0); + + // Lower triangle (direct) + REQUIRE(mat.get(1, 0) == 2.0); + REQUIRE(mat.get(2, 0) == 4.0); + REQUIRE(mat.get(3, 2) == 9.0); + + // Upper triangle (reflected from lower) + REQUIRE(mat.get(0, 1) == 2.0); + REQUIRE(mat.get(0, 2) == 4.0); + REQUIRE(mat.get(0, 3) == 7.0); + REQUIRE(mat.get(1, 2) == 5.0); + REQUIRE(mat.get(1, 3) == 8.0); + REQUIRE(mat.get(2, 3) == 9.0); +} + +TEST_CASE("Symmetric getelementref reflects upper to lower", "[matrix_2d][symmetric]") { + matrix_2d mat(3, 3); + mat.put(0, 0, 10.0); + mat.put(1, 0, 20.0); mat.put(1, 1, 30.0); + mat.put(2, 0, 40.0); mat.put(2, 1, 50.0); mat.put(2, 2, 60.0); + mat.set_symmetric(true); + + // const version + const matrix_2d& cmat = mat; + REQUIRE(*cmat.getelementref(0, 2) == 40.0); // reflects (0,2) -> (2,0) + REQUIRE(*cmat.getelementref(2, 0) == 40.0); // direct + + // non-const version + REQUIRE(*mat.getelementref(1, 2) == 50.0); // reflects (1,2) -> (2,1) 
+} + +TEST_CASE("Symmetric redim clears flag", "[matrix_2d][symmetric]") { + matrix_2d mat(3, 3); + mat.set_symmetric(true); + mat.redim(4, 4); + REQUIRE(mat.is_symmetric() == false); +} + +TEST_CASE("Cholesky inverse with mark_symmetric", "[matrix_2d][symmetric]") { + matrix_2d mat(3, 3); + mat.put(0, 0, 4.0); + mat.put(0, 1, -1.0); + mat.put(0, 2, -1.0); + mat.put(1, 0, -1.0); + mat.put(1, 1, 3.0); + mat.put(1, 2, -1.0); + mat.put(2, 0, -1.0); + mat.put(2, 1, -1.0); + mat.put(2, 2, 2.0); + + matrix_2d inverse = mat.cholesky_inverse(false, true); + + REQUIRE(inverse.is_symmetric() == true); + // Values should match the non-symmetric cholesky inverse + REQUIRE(abs(inverse.get(0, 0) - 0.384615) < 0.0001); + REQUIRE(abs(inverse.get(1, 1) - 0.538462) < 0.0001); + REQUIRE(abs(inverse.get(2, 2) - 0.846154) < 0.0001); + // Upper triangle via symmetric reflection + REQUIRE(abs(inverse.get(0, 1) - 0.230769) < 0.0001); + REQUIRE(abs(inverse.get(0, 2) - 0.307692) < 0.0001); + REQUIRE(abs(inverse.get(1, 2) - 0.384615) < 0.0001); + // Symmetry + REQUIRE(abs(inverse.get(0, 1) - inverse.get(1, 0)) < 1e-10); + REQUIRE(abs(inverse.get(0, 2) - inverse.get(2, 0)) < 1e-10); + REQUIRE(abs(inverse.get(1, 2) - inverse.get(2, 1)) < 1e-10); +} + +TEST_CASE("Cholesky mark_symmetric matches full inverse", "[matrix_2d][symmetric]") { + // Larger 5x5 SPD matrix + matrix_2d full(5, 5); + matrix_2d sym(5, 5); + double spd[] = { + 10, 1, 2, 0, 1, + 1, 8, 1, 2, 0, + 2, 1, 7, 1, 1, + 0, 2, 1, 6, 1, + 1, 0, 1, 1, 5 + }; + for (UINT32 i = 0; i < 5; ++i) + for (UINT32 j = 0; j < 5; ++j) { + full.put(i, j, spd[i * 5 + j]); + sym.put(i, j, spd[i * 5 + j]); + } + + full.cholesky_inverse(false, false); // full inverse with fillupper + sym.cholesky_inverse(false, true); // symmetric inverse + + REQUIRE(sym.is_symmetric() == true); + REQUIRE(full.is_symmetric() == false); + + // All elements should match + for (UINT32 i = 0; i < 5; ++i) + for (UINT32 j = 0; j < 5; ++j) + REQUIRE(abs(sym.get(i, j) - 
full.get(i, j)) < 1e-12); +} + +TEST_CASE("multiply_sym matches dgemm multiply", "[matrix_2d][symmetric]") { + // Create a symmetric 4x4 matrix and a 4x2 rhs + matrix_2d A(4, 4); + double spd[] = { + 5, 1, 2, 0, + 1, 4, 1, 1, + 2, 1, 6, 1, + 0, 1, 1, 3 + }; + for (UINT32 i = 0; i < 4; ++i) + for (UINT32 j = 0; j < 4; ++j) + A.put(i, j, spd[i * 4 + j]); + + matrix_2d B(4, 2); + B.put(0, 0, 1.0); B.put(0, 1, 5.0); + B.put(1, 0, 2.0); B.put(1, 1, 6.0); + B.put(2, 0, 3.0); B.put(2, 1, 7.0); + B.put(3, 0, 4.0); B.put(3, 1, 8.0); + + // dgemm result + matrix_2d C_full(4, 2); + C_full.multiply(A, "N", B, "N"); + + // dsymm result — only lower triangle in A_sym + matrix_2d A_sym(4, 4); + for (UINT32 i = 0; i < 4; ++i) + for (UINT32 j = 0; j <= i; ++j) + A_sym.put(i, j, spd[i * 4 + j]); + A_sym.set_symmetric(true); + + matrix_2d C_sym(4, 2); + C_sym.multiply_sym(A_sym, B); + + for (UINT32 i = 0; i < 4; ++i) + for (UINT32 j = 0; j < 2; ++j) + REQUIRE(abs(C_sym.get(i, j) - C_full.get(i, j)) < 1e-12); +} + +TEST_CASE("multiply_sym with column vector", "[matrix_2d][symmetric]") { + // This mirrors the Solve() hot path: N_inv * At_Vinv_m + matrix_2d N(3, 3); + N.put(0, 0, 4.0); + N.put(0, 1, -1.0); + N.put(0, 2, -1.0); + N.put(1, 0, -1.0); + N.put(1, 1, 3.0); + N.put(1, 2, -1.0); + N.put(2, 0, -1.0); + N.put(2, 1, -1.0); + N.put(2, 2, 2.0); + + // Full inverse + matrix_2d N_full(N); + N_full.cholesky_inverse(false, false); + + // Symmetric inverse + matrix_2d N_sym(N); + N_sym.cholesky_inverse(false, true); + + matrix_2d rhs(3, 1); + rhs.put(0, 0, 1.0); + rhs.put(1, 0, 2.0); + rhs.put(2, 0, 3.0); + + // dgemm + matrix_2d result_full(3, 1); + result_full.multiply(N_full, "N", rhs, "N"); + + // dsymm + matrix_2d result_sym(3, 1); + result_sym.multiply_sym(N_sym, rhs); + + for (UINT32 i = 0; i < 3; ++i) + REQUIRE(abs(result_sym.get(i, 0) - result_full.get(i, 0)) < 1e-12); +} + +TEST_CASE("Symmetric copyelements 3x3 from lower triangle", "[matrix_2d][symmetric]") { + matrix_2d 
src(6, 6); + // Fill lower triangle only + for (UINT32 i = 0; i < 6; ++i) + for (UINT32 j = 0; j <= i; ++j) + src.put(i, j, static_cast(i * 10 + j)); + src.set_symmetric(true); + + matrix_2d dst(3, 3); + + // Copy from lower triangle region (row >= col) — should work directly + dst.copyelements(0, 0, src, 3, 0, 3, 3); + REQUIRE(dst.get(0, 0) == 30.0); + REQUIRE(dst.get(1, 0) == 40.0); + REQUIRE(dst.get(2, 0) == 50.0); + REQUIRE(dst.get(0, 1) == 31.0); + REQUIRE(dst.get(1, 1) == 41.0); + REQUIRE(dst.get(2, 1) == 51.0); + REQUIRE(dst.get(0, 2) == 32.0); + REQUIRE(dst.get(1, 2) == 42.0); + REQUIRE(dst.get(2, 2) == 52.0); +} + +TEST_CASE("Symmetric copyelements 3x3 from upper triangle", "[matrix_2d][symmetric]") { + matrix_2d src(6, 6); + // Fill lower triangle only: src(i,j) = i*10+j for j<=i + for (UINT32 i = 0; i < 6; ++i) + for (UINT32 j = 0; j <= i; ++j) + src.put(i, j, static_cast(i * 10 + j)); + src.set_symmetric(true); + + matrix_2d dst(3, 3); + + // Copy from upper triangle region (row_src=0, col_src=3) + // dst(r,c) = src.get(r, 3+c) which reflects to src(3+c, r) = (3+c)*10 + r + dst.copyelements(0, 0, src, 0, 3, 3, 3); + REQUIRE(dst.get(0, 0) == 30.0); // src.get(0,3) = src(3,0) = 30 + REQUIRE(dst.get(1, 0) == 31.0); // src.get(1,3) = src(3,1) = 31 + REQUIRE(dst.get(2, 0) == 32.0); // src.get(2,3) = src(3,2) = 32 + REQUIRE(dst.get(0, 1) == 40.0); // src.get(0,4) = src(4,0) = 40 + REQUIRE(dst.get(1, 1) == 41.0); // src.get(1,4) = src(4,1) = 41 + REQUIRE(dst.get(2, 1) == 42.0); // src.get(2,4) = src(4,2) = 42 + REQUIRE(dst.get(0, 2) == 50.0); // src.get(0,5) = src(5,0) = 50 + REQUIRE(dst.get(1, 2) == 51.0); // src.get(1,5) = src(5,1) = 51 + REQUIRE(dst.get(2, 2) == 52.0); // src.get(2,5) = src(5,2) = 52 +} + +TEST_CASE("Symmetric blockadd 3x3 from upper triangle", "[matrix_2d][symmetric]") { + matrix_2d src(6, 6); + for (UINT32 i = 0; i < 6; ++i) + for (UINT32 j = 0; j <= i; ++j) + src.put(i, j, static_cast(i * 10 + j)); + src.set_symmetric(true); + + 
matrix_2d dst(3, 3); + // Pre-fill with 1.0 to verify addition + for (UINT32 i = 0; i < 3; ++i) + for (UINT32 j = 0; j < 3; ++j) + dst.put(i, j, 1.0); + + // blockadd from upper triangle (row_src=0, col_src=3) + dst.blockadd(0, 0, src, 0, 3, 3, 3); + + // dst(r,c) = 1.0 + src.get(r, 3+c) + REQUIRE(dst.get(0, 0) == 31.0); // 1 + 30 + REQUIRE(dst.get(1, 0) == 32.0); // 1 + 31 + REQUIRE(dst.get(2, 0) == 33.0); // 1 + 32 + REQUIRE(dst.get(0, 1) == 41.0); // 1 + 40 + REQUIRE(dst.get(1, 1) == 42.0); // 1 + 41 + REQUIRE(dst.get(2, 1) == 43.0); // 1 + 42 + REQUIRE(dst.get(0, 2) == 51.0); // 1 + 50 + REQUIRE(dst.get(1, 2) == 52.0); // 1 + 51 + REQUIRE(dst.get(2, 2) == 53.0); // 1 + 52 +} + +TEST_CASE("Symmetric copyelements generic from upper triangle", "[matrix_2d][symmetric]") { + matrix_2d src(8, 8); + for (UINT32 i = 0; i < 8; ++i) + for (UINT32 j = 0; j <= i; ++j) + src.put(i, j, static_cast(i * 10 + j)); + src.set_symmetric(true); + + // Copy a 4x4 block from upper triangle (row=0, col=4) + matrix_2d dst(4, 4); + dst.copyelements(0, 0, src, 0, 4, 4, 4); + + // Verify via get() which handles reflection + for (UINT32 i = 0; i < 4; ++i) + for (UINT32 j = 0; j < 4; ++j) + REQUIRE(abs(dst.get(i, j) - src.get(i, 4 + j)) < 1e-15); +} + +TEST_CASE("End-to-end: cholesky mark_symmetric then multiply_sym", "[matrix_2d][symmetric]") { + // Simulate the full Solve() path + matrix_2d N(3, 3); + N.put(0, 0, 4.0); N.put(0, 1, -1.0); N.put(0, 2, -1.0); + N.put(1, 0, -1.0); N.put(1, 1, 3.0); N.put(1, 2, -1.0); + N.put(2, 0, -1.0); N.put(2, 1, -1.0); N.put(2, 2, 2.0); + + matrix_2d N_ref(N); + + // Reference: full inverse + dgemm + N_ref.cholesky_inverse(false, false); + + matrix_2d rhs(3, 1); + rhs.put(0, 0, 10.0); + rhs.put(1, 0, 20.0); + rhs.put(2, 0, 30.0); + + matrix_2d ref_result(3, 1); + ref_result.multiply(N_ref, "N", rhs, "N"); + + // Test: symmetric inverse + dsymm + N.cholesky_inverse(false, true); + REQUIRE(N.is_symmetric() == true); + + matrix_2d sym_result(3, 1); + 
sym_result.multiply_sym(N, rhs); + + for (UINT32 i = 0; i < 3; ++i) + REQUIRE(abs(sym_result.get(i, 0) - ref_result.get(i, 0)) < 1e-12); +} + +// ============================================================================ +// Packed symmetric matrix storage +// ============================================================================ + +TEST_CASE("Packed allocation has correct buffer size", "[matrix_2d][packed]") { + matrix_2d mat; + mat.redim_packed(4); + REQUIRE(mat.rows() == 4); + REQUIRE(mat.columns() == 4); + REQUIRE(mat.is_packed() == true); + REQUIRE(mat.is_symmetric() == true); + // n*(n+1)/2 = 10 elements + REQUIRE(mat.memRows() == 4); +} + +TEST_CASE("Packed element access: put and get reflect symmetry", "[matrix_2d][packed]") { + matrix_2d mat; + mat.redim_packed(4); + + mat.put(0, 0, 1.0); + mat.put(1, 0, 2.0); mat.put(1, 1, 3.0); + mat.put(2, 0, 4.0); mat.put(2, 1, 5.0); mat.put(2, 2, 6.0); + mat.put(3, 0, 7.0); mat.put(3, 1, 8.0); mat.put(3, 2, 9.0); mat.put(3, 3, 10.0); + + // Diagonal + REQUIRE(mat.get(0, 0) == 1.0); + REQUIRE(mat.get(1, 1) == 3.0); + REQUIRE(mat.get(2, 2) == 6.0); + REQUIRE(mat.get(3, 3) == 10.0); + + // Lower triangle (direct) + REQUIRE(mat.get(1, 0) == 2.0); + REQUIRE(mat.get(2, 0) == 4.0); + REQUIRE(mat.get(3, 2) == 9.0); + + // Upper triangle (reflected from lower) + REQUIRE(mat.get(0, 1) == 2.0); + REQUIRE(mat.get(0, 2) == 4.0); + REQUIRE(mat.get(0, 3) == 7.0); + REQUIRE(mat.get(1, 2) == 5.0); + REQUIRE(mat.get(1, 3) == 8.0); + REQUIRE(mat.get(2, 3) == 9.0); +} + +TEST_CASE("Packed put via upper triangle maps to lower", "[matrix_2d][packed]") { + matrix_2d mat; + mat.redim_packed(3); + + mat.put(0, 1, 42.0); + REQUIRE(mat.get(1, 0) == 42.0); + REQUIRE(mat.get(0, 1) == 42.0); +} + +TEST_CASE("Packed elementadd skips upper triangle", "[matrix_2d][packed]") { + matrix_2d mat; + mat.redim_packed(3); + + mat.elementadd(1, 0, 10.0); + mat.elementadd(0, 1, 5.0); // skipped — upper triangle + REQUIRE(mat.get(1, 0) == 10.0); + 
REQUIRE(mat.get(0, 1) == 10.0); + + mat.elementadd(1, 0, 3.0); // lower triangle — accumulates + REQUIRE(mat.get(1, 0) == 13.0); +} + +TEST_CASE("Packed cholesky_inverse matches full inverse", "[matrix_2d][packed]") { + matrix_2d full(3, 3); + full.put(0, 0, 4.0); full.put(0, 1, -1.0); full.put(0, 2, -1.0); + full.put(1, 0, -1.0); full.put(1, 1, 3.0); full.put(1, 2, -1.0); + full.put(2, 0, -1.0); full.put(2, 1, -1.0); full.put(2, 2, 2.0); + + matrix_2d packed; + packed.redim_packed(3); + packed.put(0, 0, 4.0); + packed.put(1, 0, -1.0); packed.put(1, 1, 3.0); + packed.put(2, 0, -1.0); packed.put(2, 1, -1.0); packed.put(2, 2, 2.0); + + full.cholesky_inverse(); + packed.cholesky_inverse(); + + REQUIRE(packed.is_packed() == true); + for (UINT32 i = 0; i < 3; ++i) + for (UINT32 j = 0; j < 3; ++j) + REQUIRE(abs(packed.get(i, j) - full.get(i, j)) < 1e-10); +} + +TEST_CASE("Packed cholesky_inverse throws on indefinite matrix", "[matrix_2d][packed]") { + matrix_2d mat; + mat.redim_packed(2); + mat.put(0, 0, 2.0); + mat.put(1, 0, 1.0); + mat.put(1, 1, -1.0); + + bool caught = false; + try { mat.cholesky_inverse(); } + catch (const MatrixInversionFailure&) { caught = true; } + REQUIRE(caught); +} + +TEST_CASE("Packed multiply_sym with vector matches full result", "[matrix_2d][packed]") { + matrix_2d full(3, 3); + full.put(0, 0, 4.0); full.put(0, 1, -1.0); full.put(0, 2, -1.0); + full.put(1, 0, -1.0); full.put(1, 1, 3.0); full.put(1, 2, -1.0); + full.put(2, 0, -1.0); full.put(2, 1, -1.0); full.put(2, 2, 2.0); + + matrix_2d packed; + packed.redim_packed(3); + packed.put(0, 0, 4.0); + packed.put(1, 0, -1.0); packed.put(1, 1, 3.0); + packed.put(2, 0, -1.0); packed.put(2, 1, -1.0); packed.put(2, 2, 2.0); + + matrix_2d rhs(3, 1); + rhs.put(0, 0, 1.0); rhs.put(1, 0, 2.0); rhs.put(2, 0, 3.0); + + matrix_2d result_full(3, 1); + result_full.multiply(full, "N", rhs, "N"); + + matrix_2d result_packed(3, 1); + result_packed.multiply_sym(packed, rhs); + + for (UINT32 i = 0; i < 3; ++i) + 
REQUIRE(abs(result_packed.get(i, 0) - result_full.get(i, 0)) < 1e-12); +} + +TEST_CASE("Packed multiply_sym with multi-column rhs", "[matrix_2d][packed]") { + matrix_2d packed; + packed.redim_packed(4); + double spd[] = {5, 1, 2, 0, 1, 4, 1, 1, 2, 1, 6, 1, 0, 1, 1, 3}; + for (UINT32 i = 0; i < 4; ++i) + for (UINT32 j = 0; j <= i; ++j) + packed.put(i, j, spd[i * 4 + j]); + + matrix_2d full(4, 4); + for (UINT32 i = 0; i < 4; ++i) + for (UINT32 j = 0; j < 4; ++j) + full.put(i, j, spd[i * 4 + j]); + + matrix_2d B(4, 2); + B.put(0, 0, 1.0); B.put(0, 1, 5.0); + B.put(1, 0, 2.0); B.put(1, 1, 6.0); + B.put(2, 0, 3.0); B.put(2, 1, 7.0); + B.put(3, 0, 4.0); B.put(3, 1, 8.0); + + matrix_2d C_full(4, 2); + C_full.multiply(full, "N", B, "N"); + + matrix_2d C_packed(4, 2); + C_packed.multiply_sym(packed, B); + + for (UINT32 i = 0; i < 4; ++i) + for (UINT32 j = 0; j < 2; ++j) + REQUIRE(abs(C_packed.get(i, j) - C_full.get(i, j)) < 1e-12); +} + +TEST_CASE("Packed scale_symmetric_diagonal", "[matrix_2d][packed]") { + matrix_2d packed; + packed.redim_packed(3); + packed.put(0, 0, 4.0); + packed.put(1, 0, -1.0); packed.put(1, 1, 3.0); + packed.put(2, 0, -1.0); packed.put(2, 1, -1.0); packed.put(2, 2, 2.0); + + matrix_2d full(3, 3); + full.put(0, 0, 4.0); full.put(0, 1, -1.0); full.put(0, 2, -1.0); + full.put(1, 0, -1.0); full.put(1, 1, 3.0); full.put(1, 2, -1.0); + full.put(2, 0, -1.0); full.put(2, 1, -1.0); full.put(2, 2, 2.0); + full.set_symmetric(true); + + double diag[] = {2.0, 3.0, 0.5}; + packed.scale_symmetric_diagonal(diag); + full.scale_symmetric_diagonal(diag); + + for (UINT32 i = 0; i < 3; ++i) + for (UINT32 j = 0; j < 3; ++j) + REQUIRE(abs(packed.get(i, j) - full.get(i, j)) < 1e-12); +} + +TEST_CASE("Packed blockadd diagonal block", "[matrix_2d][packed]") { + matrix_2d packed; + packed.redim_packed(4); + + // Diagonal blocks only write lower triangle via elementadd + // (mirrors how AddMsrtoNormalsVar works) + packed.elementadd(1, 1, 10.0); + packed.elementadd(2, 1, 20.0); 
+ packed.elementadd(2, 2, 40.0); + + REQUIRE(packed.get(1, 1) == 10.0); + REQUIRE(packed.get(2, 1) == 20.0); + REQUIRE(packed.get(1, 2) == 20.0); + REQUIRE(packed.get(2, 2) == 40.0); + REQUIRE(packed.get(0, 0) == 0.0); +} + +TEST_CASE("Packed submatrix extraction", "[matrix_2d][packed]") { + matrix_2d packed; + packed.redim_packed(4); + packed.put(0, 0, 1.0); + packed.put(1, 0, 2.0); packed.put(1, 1, 3.0); + packed.put(2, 0, 4.0); packed.put(2, 1, 5.0); packed.put(2, 2, 6.0); + packed.put(3, 0, 7.0); packed.put(3, 1, 8.0); packed.put(3, 2, 9.0); packed.put(3, 3, 10.0); + + matrix_2d sub = packed.submatrix(1, 1, 2, 2); + REQUIRE(sub.rows() == 2); + REQUIRE(sub.columns() == 2); + REQUIRE(sub.get(0, 0) == 3.0); + REQUIRE(sub.get(0, 1) == 5.0); + REQUIRE(sub.get(1, 0) == 5.0); + REQUIRE(sub.get(1, 1) == 6.0); +} + +TEST_CASE("Packed copy constructor", "[matrix_2d][packed]") { + matrix_2d orig; + orig.redim_packed(3); + orig.put(0, 0, 1.0); + orig.put(1, 0, 2.0); orig.put(1, 1, 3.0); + orig.put(2, 0, 4.0); orig.put(2, 1, 5.0); orig.put(2, 2, 6.0); + + matrix_2d copy(orig); + REQUIRE(copy.is_packed() == true); + REQUIRE(copy.is_symmetric() == true); + for (UINT32 i = 0; i < 3; ++i) + for (UINT32 j = 0; j < 3; ++j) + REQUIRE(copy.get(i, j) == orig.get(i, j)); +} + +TEST_CASE("Packed operator= packed-to-packed", "[matrix_2d][packed]") { + matrix_2d a; + a.redim_packed(3); + a.put(0, 0, 10.0); a.put(1, 0, 20.0); a.put(1, 1, 30.0); + a.put(2, 0, 40.0); a.put(2, 1, 50.0); a.put(2, 2, 60.0); + + matrix_2d b; + b.redim_packed(2); + b = a; + REQUIRE(b.is_packed() == true); + REQUIRE(b.rows() == 3); + for (UINT32 i = 0; i < 3; ++i) + for (UINT32 j = 0; j < 3; ++j) + REQUIRE(b.get(i, j) == a.get(i, j)); +} + +TEST_CASE("Packed operator= full-to-packed preserves full", "[matrix_2d][packed]") { + matrix_2d full(3, 3); + full.put(0, 0, 1.0); full.put(0, 1, 2.0); full.put(0, 2, 3.0); + full.put(1, 0, 4.0); full.put(1, 1, 5.0); full.put(1, 2, 6.0); + full.put(2, 0, 7.0); full.put(2, 1, 
8.0); full.put(2, 2, 9.0); + + matrix_2d packed; + packed.redim_packed(3); + packed = full; + // Assigning a non-packed to a packed should convert to non-packed + REQUIRE(packed.is_packed() == false); + for (UINT32 i = 0; i < 3; ++i) + for (UINT32 j = 0; j < 3; ++j) + REQUIRE(packed.get(i, j) == full.get(i, j)); +} + +TEST_CASE("Packed redim_packed reuses buffer when shrinking", "[matrix_2d][packed]") { + matrix_2d mat; + mat.redim_packed(5); + mat.put(0, 0, 99.0); + + mat.redim_packed(3); + REQUIRE(mat.rows() == 3); + REQUIRE(mat.is_packed() == true); + REQUIRE(mat.get(0, 0) == 0.0); +} + +TEST_CASE("Packed zero", "[matrix_2d][packed]") { + matrix_2d mat; + mat.redim_packed(3); + mat.put(0, 0, 1.0); mat.put(1, 0, 2.0); mat.put(2, 2, 3.0); + mat.zero(); + for (UINT32 i = 0; i < 3; ++i) + for (UINT32 j = 0; j < 3; ++j) + REQUIRE(mat.get(i, j) == 0.0); +} + +TEST_CASE("Packed scale", "[matrix_2d][packed]") { + matrix_2d mat; + mat.redim_packed(3); + mat.put(0, 0, 2.0); + mat.put(1, 0, 3.0); mat.put(1, 1, 4.0); + mat.put(2, 0, 5.0); mat.put(2, 1, 6.0); mat.put(2, 2, 7.0); + mat.scale(2.0); + + REQUIRE(mat.get(0, 0) == 4.0); + REQUIRE(mat.get(1, 0) == 6.0); + REQUIRE(mat.get(2, 2) == 14.0); + REQUIRE(mat.get(0, 1) == 6.0); // reflected +} + +TEST_CASE("Packed end-to-end: cholesky then multiply_sym", "[matrix_2d][packed]") { + matrix_2d full(3, 3); + full.put(0, 0, 4.0); full.put(0, 1, -1.0); full.put(0, 2, -1.0); + full.put(1, 0, -1.0); full.put(1, 1, 3.0); full.put(1, 2, -1.0); + full.put(2, 0, -1.0); full.put(2, 1, -1.0); full.put(2, 2, 2.0); + + matrix_2d packed; + packed.redim_packed(3); + packed.put(0, 0, 4.0); + packed.put(1, 0, -1.0); packed.put(1, 1, 3.0); + packed.put(2, 0, -1.0); packed.put(2, 1, -1.0); packed.put(2, 2, 2.0); + + // Reference + full.cholesky_inverse(false, false); + matrix_2d rhs(3, 1); + rhs.put(0, 0, 10.0); rhs.put(1, 0, 20.0); rhs.put(2, 0, 30.0); + matrix_2d ref_result(3, 1); + ref_result.multiply(full, "N", rhs, "N"); + + // Packed path 
+ packed.cholesky_inverse(); + matrix_2d packed_result(3, 1); + packed_result.multiply_sym(packed, rhs); + + for (UINT32 i = 0; i < 3; ++i) + REQUIRE(abs(packed_result.get(i, 0) - ref_result.get(i, 0)) < 1e-12); +} + +TEST_CASE("Packed copyelements from packed source", "[matrix_2d][packed]") { + matrix_2d packed; + packed.redim_packed(6); + for (UINT32 i = 0; i < 6; ++i) + for (UINT32 j = 0; j <= i; ++j) + packed.put(i, j, static_cast(i * 10 + j)); + + matrix_2d dst(3, 3); + dst.copyelements(0, 0, packed, 3, 0, 3, 3); + + REQUIRE(dst.get(0, 0) == 30.0); + REQUIRE(dst.get(1, 0) == 40.0); + REQUIRE(dst.get(2, 0) == 50.0); + REQUIRE(dst.get(0, 1) == 31.0); + REQUIRE(dst.get(1, 1) == 41.0); + REQUIRE(dst.get(2, 1) == 51.0); +} + +TEST_CASE("Packed 5x5 cholesky_inverse matches full", "[matrix_2d][packed]") { + double spd[] = { + 10, 1, 2, 0, 1, + 1, 8, 1, 2, 0, + 2, 1, 7, 1, 1, + 0, 2, 1, 6, 1, + 1, 0, 1, 1, 5 + }; + + matrix_2d full(5, 5); + matrix_2d packed; + packed.redim_packed(5); + + for (UINT32 i = 0; i < 5; ++i) + for (UINT32 j = 0; j < 5; ++j) { + full.put(i, j, spd[i * 5 + j]); + if (j <= i) + packed.put(i, j, spd[i * 5 + j]); + } + + full.cholesky_inverse(false, false); + packed.cholesky_inverse(); + + for (UINT32 i = 0; i < 5; ++i) + for (UINT32 j = 0; j < 5; ++j) + REQUIRE(abs(packed.get(i, j) - full.get(i, j)) < 1e-12); +} + +// ================================================================ +// In-place mmap buffer tests (AttachMappedFileRegion / DetachMappedFileRegion) +// ================================================================ + +TEST_CASE("In-place mmap round-trip: full matrix", "[matrix_2d][mmap]") { + // Create a 4x3 full matrix + matrix_2d orig(4, 3); + for (UINT32 r = 0; r < 4; ++r) + for (UINT32 c = 0; c < 3; ++c) + orig.put(r, c, (r + 1) * 10.0 + c); + orig.compute_maximum_value(); + + // Allocate a buffer to simulate an mmap region + std::size_t region_size = orig.get_size(); + std::vector region(region_size, 0); + void* addr = 
region.data(); + + // Write the matrix to the "mmap" region + orig.WriteMappedFileRegion(addr); + + // Attach in-place — should point _buffer at the data in the region + matrix_2d attached; + attached.AttachMappedFileRegion(addr); + + REQUIRE(!attached.owns_buffer()); + REQUIRE(attached.rows() == 4); + REQUIRE(attached.columns() == 3); + REQUIRE(attached.matrixType() == mtx_full); + + // Verify all elements match + for (UINT32 r = 0; r < 4; ++r) + for (UINT32 c = 0; c < 3; ++c) + REQUIRE(attached.get(r, c) == orig.get(r, c)); + + // Modify in-place (writes directly to region) + attached.put(2, 1, 999.0); + REQUIRE(attached.get(2, 1) == 999.0); + + // Detach + attached.DetachMappedFileRegion(addr); + REQUIRE(attached.empty()); + REQUIRE(attached.owns_buffer()); + + // Read back with copy-based Read to verify data persisted + matrix_2d readback; + readback.ReadMappedFileRegion(addr); + REQUIRE(readback.get(2, 1) == 999.0); + REQUIRE(readback.get(0, 0) == 10.0); +} + +TEST_CASE("In-place mmap round-trip: packed lower-triangular", "[matrix_2d][mmap][packed]") { + // Create a 4x4 packed lower-triangular matrix + matrix_2d orig; + orig.redim_packed(4); + for (UINT32 i = 0; i < 4; ++i) + for (UINT32 j = 0; j <= i; ++j) + orig.put(i, j, (i + 1) * 10.0 + j); + orig.compute_maximum_value(); + + std::size_t region_size = orig.get_size(); + std::vector region(region_size, 0); + void* addr = region.data(); + + orig.WriteMappedFileRegion(addr); + + matrix_2d attached; + attached.AttachMappedFileRegion(addr); + + REQUIRE(!attached.owns_buffer()); + REQUIRE(attached.is_packed()); + REQUIRE(attached.is_symmetric()); + REQUIRE(attached.rows() == 4); + + // Verify all elements + for (UINT32 i = 0; i < 4; ++i) + for (UINT32 j = 0; j <= i; ++j) + REQUIRE(attached.get(i, j) == orig.get(i, j)); + + // Symmetry: upper = lower + REQUIRE(attached.get(0, 1) == attached.get(1, 0)); + + // Modify in-place + attached.put(3, 0, -7.5); + + // WriteMappedFileRegion on in-place buffer should only 
write footer
+ attached.WriteMappedFileRegion(addr);
+
+ // deallocate detaches without freeing
+ attached.deallocate();
+ REQUIRE(attached.empty());
+ REQUIRE(attached.owns_buffer());
+
+ // Read back to verify
+ matrix_2d readback;
+ readback.ReadMappedFileRegion(addr);
+ REQUIRE(readback.get(3, 0) == -7.5);
+ REQUIRE(readback.get(1, 0) == 20.0);
+}
+
+TEST_CASE("In-place mmap: deallocate does not crash", "[matrix_2d][mmap]") {
+ matrix_2d orig(3, 3);
+ for (UINT32 i = 0; i < 9; ++i)
+ orig.put(i / 3, i % 3, i + 1.0);
+
+ std::size_t region_size = orig.get_size();
+ std::vector<char> region(region_size, 0);
+ void* addr = region.data();
+ orig.WriteMappedFileRegion(addr);
+
+ matrix_2d attached;
+ attached.AttachMappedFileRegion(addr);
+ REQUIRE(!attached.owns_buffer());
+
+ // deallocate should not crash (no delete[] on mmap pointer)
+ attached.deallocate();
+ REQUIRE(attached.empty());
+ REQUIRE(attached.owns_buffer());
+
+ // Re-attach and verify data still intact
+ attached.AttachMappedFileRegion(addr);
+ REQUIRE(attached.get(1, 1) == 5.0);
+ attached.deallocate();
+}
+
+TEST_CASE("In-place mmap: copy constructor makes owned copy", "[matrix_2d][mmap]") {
+ matrix_2d orig(2, 2);
+ orig.put(0, 0, 1.0); orig.put(0, 1, 2.0);
+ orig.put(1, 0, 3.0); orig.put(1, 1, 4.0);
+
+ std::size_t region_size = orig.get_size();
+ std::vector<char> region(region_size, 0);
+ void* addr = region.data();
+ orig.WriteMappedFileRegion(addr);
+
+ matrix_2d attached;
+ attached.AttachMappedFileRegion(addr);
+
+ // Copy constructor should create an owned copy
+ matrix_2d copy(attached);
+ REQUIRE(copy.owns_buffer());
+ REQUIRE(copy.get(1, 1) == 4.0);
+
+ // Modifying copy should not affect mmap
+ copy.put(1, 1, 99.0);
+ REQUIRE(attached.get(1, 1) == 4.0);
+
+ attached.deallocate();
+}
+
+TEST_CASE("In-place mmap: operator= into owned buffer", "[matrix_2d][mmap]") {
+ matrix_2d orig(3, 2);
+ for (UINT32 r = 0; r < 3; ++r)
+ for (UINT32 c = 0; c < 2; ++c)
+ orig.put(r, c, r * 2.0 + c);
+
+ 
std::size_t region_size = orig.get_size();
+ std::vector<char> region(region_size, 0);
+ void* addr = region.data();
+ orig.WriteMappedFileRegion(addr);
+
+ matrix_2d attached;
+ attached.AttachMappedFileRegion(addr);
+
+ // Assign to a pre-allocated owned matrix of same size
+ matrix_2d dest(3, 2);
+ dest = attached;
+ REQUIRE(dest.owns_buffer());
+ REQUIRE(dest.get(2, 1) == 5.0);
+
+ attached.deallocate();
+}
+
+TEST_CASE("In-place mmap: region size alignment", "[matrix_2d][mmap]") {
+ // Verify that get_size() returns 8-byte aligned sizes for proper mmap alignment
+ matrix_2d full(7, 5);
+ REQUIRE(full.get_size() % 8 == 0);
+
+ matrix_2d packed;
+ packed.redim_packed(11);
+ REQUIRE(packed.get_size() % 8 == 0);
+
+ matrix_2d col_vec(100, 1);
+ REQUIRE(col_vec.get_size() % 8 == 0);
+}
+
+TEST_CASE("operator<< binary size matches get_size for mmap regions", "[matrix_2d][mmap]") {
+ // Verify that operator<< writes exactly get_size() bytes in binary mode.
+ // A mismatch here causes mmap region offsets to be wrong for all blocks
+ // after the first, leading to corrupted header reads. 
+
+ auto check_stream_size = [](matrix_2d& m, const std::string& label) {
+ std::ostringstream oss;
+ oss.iword(0) = binary;
+ oss << m;
+ std::size_t stream_bytes = oss.str().size();
+ std::size_t region_bytes = m.get_size();
+ REQUIRE(stream_bytes == region_bytes);
+ };
+
+ // Full matrix (column vector — the case that crashed)
+ matrix_2d col_vec(50, 1);
+ for (UINT32 r = 0; r < 50; ++r)
+ col_vec.put(r, 0, static_cast<double>(r));
+ check_stream_size(col_vec, "column vector");
+
+ // Full matrix
+ matrix_2d full(7, 5);
+ for (UINT32 r = 0; r < 7; ++r)
+ for (UINT32 c = 0; c < 5; ++c)
+ full.put(r, c, static_cast<double>(r * 10 + c));
+ check_stream_size(full, "full 7x5");
+
+ // Packed lower-triangular
+ matrix_2d packed;
+ packed.redim_packed(6);
+ for (UINT32 r = 0; r < 6; ++r)
+ for (UINT32 c = 0; c <= r; ++c)
+ packed.put(r, c, static_cast<double>(r * 10 + c));
+ check_stream_size(packed, "packed 6x6");
+
+ // Full matrix with slack (redim + shrink, simulating staged adjustment)
+ matrix_2d with_slack;
+ with_slack.redim(100, 1);
+ with_slack.shrink(30, 0); // _rows=70, _mem_rows=100
+ for (UINT32 r = 0; r < 70; ++r)
+ with_slack.put(r, 0, static_cast<double>(r));
+ check_stream_size(with_slack, "column vector with slack");
+}