From a7f0cfb91834a42755a714e220bd5742f0dcd5b9 Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Wed, 17 Sep 2025 12:48:33 +0100 Subject: [PATCH] Fix metadata encoding issue --- CHANGELOG.rst | 4 + tests/files/issue95_metadata_bug.tsz | Bin 0 -> 37624 bytes tests/files/issue95_metadata_dtype.trees | Bin 0 -> 10996 bytes tests/test_compression.py | 115 +++++++++++++++++++++++ tszip/compression.py | 12 ++- 5 files changed, 128 insertions(+), 3 deletions(-) create mode 100644 tests/files/issue95_metadata_bug.tsz create mode 100644 tests/files/issue95_metadata_dtype.trees diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 936ecdc..31ffaac 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,10 @@ [0.2.5] - 2025-XX-XX -------------------- +- In previous versions, if a metadata column had byte values outside the ASCII range, + the file written would raise a `ValueError` when decompressed. This is now fixed, + and files written with this bug are now read correctly. + (benjeffery, #115) - Drop Python 3.9 support, require Python >= 3.10 (#112, benjeffery) - Support zarr v3 (#114, benjeffery) diff --git a/tests/files/issue95_metadata_bug.tsz b/tests/files/issue95_metadata_bug.tsz new file mode 100644 index 0000000000000000000000000000000000000000..a16d102ee92a0937bf5350874424a151edbeb115 GIT binary patch literal 37624 zcmeHQ2V7Ij(oawn>Gfe0t7+=DM*nkHdL?x_O2MN9Xl$5 z1yK~chGu>l_w7$!&e{Ljotd4TnVqxiKE73JEfT!u zjkB0;kUjHX9q>0SZ9y@O3*mAEk@4*);G)LSxm^EX4mX0%rw*l9v`cWuRX(kIe11<* z2^Mi>I-k$wHB=VY^Jmi|7@#th&x>P5Qi%|6=H_?t0M%h>B80KHZ4L0z3@(q!VS_hJ z7yS;~O%p!?|1=l<=u3R<(+Nkf3cxF2`GK!(Xey5pC19`v8U8E=JA@w!+B%p<8)9ui zGdDFSwk#|EVdlWrBoZt?zGabIPBeo}2WFvc(x?*N!Taz^KzS(s0kn$G3uj8VM{*m7 zYGZ2NG*BqxGCKlssTfQPQ`%rtOBtZJwiDV-<;W%jHG&=(%4Ca1M$BLve;X^Rj1IE_ z9YNcud@h3#z@T%3WIe=Xuo!fn=s9asnzgC93C-HlB!t0caG8O!D)3@?d`1Msoxo%Z zVq`sn>2@EQjirfKfPl>xP&{4TDU(A5lo5guiYAQvKFJEz6sezn`NG^lN^vRAvx(5@PYNVoT+P3W9@K5SoS~L=ZnVQY7`Wn6w7^Q-hf-mVY#zC4e{B z0v4;G;o$cM&SQS$aDy1IBh*nybzUf41Szlo3g*MbRS7G*0x}~-irnTZ z2Ph4Th75`W-ldUGgB!5GhoG%H2b6(F&HkaBKHrKo_o9wQYQlW?dQG;=ayr|r^vmDR zyWWeaPYURLw!|~~dGXsX?s3`0W8RPK{LNx|+iUj69hN(%y`0k_@7NVVWbEeHZri?$ z>or_pVsPAbP1&on#e3H3&k}^F?!RuFp!+hit#j}?k3vBk(&Y9UZLYVP;dFSj;{%OC z_jtVItY3C+_HZCIuyg^KAwkw~zB|DnVdNsF*E zi3AJ03#Fx6T`Vm#gq<8Fl5)0z*}cn7LuQB6q7Y}@JJji=I|E$VBd1m8AMID5wb`kD z=jt;X`}}FLzJxY@_R};KJ(V^pT~vCjEX|7Taek!Dw9JSPol19Vns4X%Z@05wTE134 zGb&umG(WU${-jQf?W_ULC6qBYZ{0lPZfC-Ivt*;zq2d*Pxw$R3v%PwE@z@f#kgbO{ zrR2Vb32~5Y2f`g_qzmzB=xgA}Ni|8n(PVLLPP#IBU;;JpPKR;_$^#p!V+Wvs2X!Tr6O6_phIV;la^V`uM*(p<7+ zgWrw&Z=U3Samv4{pU37>PQL8#%`vRnFjm*L*7NC=(aDyY+96#{YiK;zvI)7PEZ^q` zkDO$+f_?jnZkXkWl~;|o+GH@2Gz^T!I62t{?6wWi9nvRmPLG4_A1-pqcfOs*aH6U9 z>2$p6@t{e)hCLmYTl2ZMZ*pf1VS?J3FOS!xzx27aep%+GPMODEo>^>Z;F$aT;;j!G zJBMB$arnOZn)0O7AR8yEWAvV@p4}}vqCfQVvsuTv+leKhz_KbE7PQ5*XLi$@*LJj@+3+mdvPeQWKP{b`Xl_U$I}20S_C zMSDp0TYi&qqO;9K&A}d#_vV_Lx$~Kp9*Nnc3q$mlTcvzd&H5xP2sC>YKC7?W*T9$R zVF!b^F1oHgF7GL!W9#9=O zt$So4)AE4!^0V9;^1xQhR_K-Z^c&soz98Yl^H(lQ9v!_&?(gj!Xq&b8O4cGBpZ2t( zoUVEm71cQ|!uG58Wb~MC)XRZiYBwWG&~=M&sISpI>x5O~u9aMBeI{m;=9FjCTpo`L zeXXaa^Zb?b*^=cKk9J(NN6)R)YDrdTfbZq1o4g0Sldqns`_&%b+i~Rn=~ZEGa)J+f z#AaGNGE}V!+fgyCaNCrptH!1n`3j9Rd`o+qweg*2eB#<;-`PPq7q;cpe?D2W(_24P zz~1`y#GORX+@cwR*o3o{uT%;@Uacu!rCPgwp3sSM5F)Q{W$*SN1yp_^pfH!2}v6d zb+vO9?7mSh6b9{@6q3GU>qR#QhiOH;9QDhCgbRvq-62HUi)W))?A^VK%1l-q07TUp6inV&UjqN*Fl~!rh zD$TP4%k8flp6YAeN}M*CUgQtGGqW9ub!5$&Nj_@M5&QIxPPVQOt0@lMb^PX}j1SJE zE-ek{N~?cyIkeBoo1<)lNC2F5aJ>^!Dxafo8h2b2?`Ji8HUP>F!VC9WFPIch~pN(@dICLpr}L=JYJrQ+_%B zIX>%zeVFmAV^)j$7Cb$#x#CD=N%|=F%PG&=Epbb#@H6r%(209r?_M7fv&-Dk>&2$7 zqgp3c_kwa^9@>>Dex`{rb&Zre5B+ zjYpnK&9cd{FR%Ia8SR!kqH^Z2QI z)lxt2j@(^e{U_<1W!+|-s?|DYpLL@Z)^x0XcJQ6*wmG-HlFz;FUHJIkwyYJYYtE(2 zY?WN<>YY`jJx@D-Sl{;x)Dj)aZQ7nNtoF<9wtHG`w>xp8Yf|E8Jj$#Ryl~}JJKwDx zx#(!!urt*|cYe%R9C!Vd-{GQTBmO+L_rl6GXYY--dF6ZNc!ihyr_cRK@fsUrJ8pd( z*S;X;yk5x~($dS`iFg0hUUEIdmXuQWdfC!&Ge6JhZKaoev)^PLG48Sduzve_K7sV( z`9t^Qo~n0$RaCThO@+WK$?2*QV|SMWSv%|dpUq0oyy%jXo?(}}Ebi$Gy&|##GkT@!o9)y1 zV}GU(Wwe=N`PAU!l+Ltv)L4c(N$c5C>&Fu*dVS3H+KzU#oW3%5{{>UclG1Lvy%!ww zx=VFDp|O`TA!%R$CWvXvB-W$w`xw?>;`d@=*fc>-9dXi?Nl0y)_

mJZ=Vz_CzFB4eW_#aqyPW~K(RZ8|>|QbORl=0yg0-Kg9G=2S z{*-Y{{{gdh=p}ZZtF1pP_eyuOCy8|jM)e)`exv&8S*CvH+D=Hyo!_rZxG*oOsC-20 zxyfewXKWU$<#FD)=dwGHYHGioi20{%Bc=AxSi28dgEZ#c-AQxl=O}FLLA4m`k=l1D zSz|-rh_;V=b=J-__rLly_Rf;z+K+9f>>XopcZNgOxt@ley|>QG*`5Ec{jU4YZqw`@ z1pA~OoMHTz-V2KNLG9Dyx6b@F!!~eO*@rJjyx8BS4L==_J8^Vb_SC#l2hAS#WN1%* zI?=;EXyTl0%m40Du)-*~iaqztzT5sEPCnTF;>p6Zbxwvi4Ci-%iBp{WCl{?$6j)n$=!->l_;v*90aZlO4p2W(EmB zTnHHGN`%O{*|vWp5g1(pP%w)g(geT$moWAPKQ07=@4o^||Kj4(8_4Upi?t@5|C=j67Px}=dd;8$QNbKvoht3+7UK%2B3i$Rq z*QEV+;0nXiM-IvTIl;j^246Adt$zjgSeX%n#v{d%S{O}+7nRPF1VJTu;#=S*8X%Rv zCyuoSF<5*$8ax*VnvnovQ^KQoN-N-TD}MiW-2iW4F;Ku`(I`~~55WQ}h%Wpv^_aWz zHZn|Mlny&gL9`#NCNx324*nHnl8!QMSm$jE9jWQr_`i#VqSZo&c|{o9sj{J3#yf?$^!iJUcbD)Lvyt&moJJeQC&FUs;jRKMyu@hd?p}h#+KF&VL{Ba7 z694-xX;~2{pkU1ZP3`V^5e!gRx)TPfgzSKSY2>|N?uG?E2@R_Qpunj6jJ+glRnnp4 zUOI^pty(X?oNZM*m9lPb=vWu7XR`Y>#}KmT?rEhfOTTQ|;(c#ZJ@?tn%B$>kx4v<* z_gpT|3qSpB@9qbtJ91rjCD$^vHu(cTN^e5Cqp5#kj`yi0;|_KiTkY6-!Dueogz2k1q^i`?!AC!3L z^w+LyzqZ@D(MS9FJ>F4EyvX^UT)kVKRf6;#5n+W(srb3_=af5pfko5p!6sNGD4w)c zBIDlw!YUbX_p`AckW^TRRtb^EP)(`81q4`N1rjN|;Xl0KOv%%{f~?~b%~=U(Q`ZP> zZGLkbg+KrW3TxrReL6mXGqA{IXans<^UehDAS|##Je3|4#DqyGbe21p6UpH6MTt&) zu7F_-@*hB66Llyxj0X}~jHw`dO7cK_f-zNWi&HrPVT?dNm5?ll4JmX4Z-Ti!N(K|& zY{|ZJNHg1QiHVnB%n!53@ootq&K5Fw&l0Oa0V#6zIkO79W z3`1@ymVwl_R`^kdwUeN8As%Ijg`}8a1k2#}8)O6mC>Uk?&Wu=sm_}Ws$t}YgiU67Q zAivd^8Vxcf!5<0n)c^7)Jmaz9A84fE8Q-jYs#%igYZKB_o8VKSc+x(VjC=nJpGrIJ zf1E zZs+&)Je7OnylYRJh2&Nw>jjIdoL7zvAjE8leikDrL|H#;+ggJqi0ubwHZcT?5!{5e zr(a^#kNK0V`e6wTy&*B7xy+Ez7D_ApG(oBL)siEJ+aQrlI7r2m7DwR~n_0hxP$QpE zeV=L8o{8W|PqQ-aeWz(wDW;BD8!`{kQ|Ryk-<7d~enXxr`DKq%lhC6Cjkz1E#iObV-ZT0sQgrKUZR%4w#QBgW@fo z$4{4!idQO1cpmzVvsfAT111;nn0#e}%)PS%=OA7#3@L~qB01n+kbTr#Y%?VS%}rSP zvX=y|WyR!>cS-L3FJf|7%?*PWVz$B^ws?Flo#oF1TO2hLOYo-*81J!p(+s}Ho`K6`SbX>{^xWc(EN)gxNA+A@K8L-r1t;t zRUe+xN?29UF%*wWB6k%HNNhY;w5^jsycM958PeF8036p55*XO5=3-R+Bq0I$#BPZ$ ze)rHhHvWQ_?3VD$cPBPhvPJ6@5;gMMW=#CZ*do>%>xI+WV1rI(G$W|RYx= z-eR9MAHRvBn9Yg|HSAc0UqGT37Gm->4=d}Z3~uz6N${2t+I&W+Xt76PZ6HGm;jp5? z`Qh535F}e(j;u5_W8UJo-(;3Z0R?_AOV>MRAsC;q$l)S3YyJv&3>Nq}OvvScBFdV- zV%>^V)3!UznvcjlFnaE3R`h|}twwlktd8iqYH7P$12ZP*pIjJoEq#4XOjaq#n(uf! zYz13=V7hvhVaQ=$$_9NK)2u1+gLUo=50gpJZ|{`!!Jlm1Rfv514wqT5K!`vqo+GS={N^9M69}L{`d)tAo;v~%8y4su z$|Sl-ebB&NSm2ZKRFEkV$Yn-~PhFsn7Qips>1@h~2zYn|oy`MZxr?{!r;hgWAb!#w z{F=HCS;B(59LRbpqPi4@_kM|!2AV~>CZ5s?1gkLUo;6Icgarm~8{@G^zTnXYu)qo= z8vbi8x3`+lkoifjz%+G+qS^imJF5IhJCaBN1+1*^!?#!pm;#GjR$}L<{op}ZUzaC<9P{PTJ7l)SOA$6!ESjk#hZre2Q(=7>5kW9OFscUDY9k^QPgVGE z`A@nq@dLMzQz}amn`Z;pYS9y@cxDosH+jOZtXwSVjC3k)F$l;DE!_S_uCG zR$n@WGru5TltLpmkORb<$p*=R(1>jYr}F*CTzaKE*fti=CQ|@~VS%~2SV}|nrUrMy zfrSDJ<{Efz`ic?mf~1<9yyCvN&~$D!UzxbB{q+Ev?N-o9Bk<#b@E|@NE1rzaBY6AuAI>(_(rt1A_4XPo$A>>h~lLpkIUl4lSAaQh+>VX#}q;oEToXv^h?gu3q+@a#Mcm{V1yt$RIG|M9uv_-wW!+mI z6h5L$lq=$Xy=dEtL0XE;<ldA{16N!Kog(gygUR$S!1Z=4j&Hx@gsXlB#e%kf%+F zTgIaGyAMKMEvhd?P^>z{>#;~l%|;a=ai>%C)n$grtL3Ob;+C0cRb3;bDjM;Z*LSQY z7KQyAV1W8FmVqKg0r{=CBB_tcfxK1__oGDnVrPo90)rye3antmB{bv`l4ncmONOkR zaYo!=5yg0ChO|L~A=Lt`dW7+4qx|r`1wh0biO3n=*qtZvBHGOwsfyk3gQ!`s_3?xB4;!RDZVL+{ z0C58@lmIVm5deI0DG>qds^goGHv~bz?fpaqzy?zB0zSYF0gwo%9J#{A2Js@g(w>N+ zU{1mYiGkfGxrAiyFnLf?B8t`cc)lJ7AefJMn;;kXMv#W%>Niz5V7f4J3CV0n97v-Q zg5@v1DGx^AP_WtcL>^;7@V52%Q8*C%mK2S_;M;Q22?vASX@LlWwXg9YgPd_7jm@iA zp@nHR$R#AN8G{2+Mso4Cr}|i#KuU-&-g|g6u2H;3Wf*U@*p5fOr;#JR#E*zk>cmN^ z5vI$FJ{E(jHio9OV_p3xGtagBbT;vjx&%4QiQi96BMUY%V@@{tph*EUMb`-Dw z>LggoND-b`K^{nZ5DX={h94dyC(DD9>JwJ~iMGjVpb#*<00^3#D@T5?>H<%a>$vLJ zucwGimycmT1>1zi)D~7%;M=jw8;jAHrogrXuZ6o$#bPu*%VLEDev5!yLh^!XSQMFG zCJ;%39S<5aij^^celi!@D0z9t)}aWj^fL7}Ak%4cD1WAg_#7saG!y#BUh5`W~rk$aHy+a!Z zlxiC>=?sNp%Y$lM2#9%9DBv5WoVF=121mjG6eb@@jsnMyU;KRZ4*>wGPFzA$W6BCY zc}KDEZIM7JU?SGS$4}l<*a{G(xQOj2UQul)VBvE@8tWxqQE5fu+a$(OqKkM%^(m?` zIH~?&Cvv!uj$A^rBUcm!!!D~6JB6M4@T^;asodCddSi%qYnjAHnOh(p--N@^RM77@Oswse?;k-Zf?U)M{oidL3UKRXAP$t>uhgoA0Rf+TYhoZ>W8($CY*s zR(d^rQoF*DTS^nCc0K>@Ui!x#EJLWe_4@1oH}!gXL@T|{e|i1u{M(gY=U?l!{nBQm z`)7LTdzW+mI)Bn@{^ix{{H;}bwG2^Sz4pIT>D8s&dhP%4-;7`9?>nV$q!L};_;vkm zDt${Yy|!Og{hHP*9HB1gU(Whz|NctYL^;y>^7`-Tr5~#Ne_mSPs$JLbd8OB%U+vqM zA@tHGD7{`k_o|=pH~s7N^XA{w>-=Xbz5cw{@$3AQmJQwS_L_f$%Af8pO|R>_+AIFv zN?+FJhmL=n((CVUnM$ww%kqENOTS9#ca$bn?RtI}yyic)L*W43Uz!!@t(|Q+kyKWR~;a zAv%7u((Ct!)@%Evg`xY$E4^MnIrr#)j?(MzuUfC|m*(GBb-bhW`uoFj<-aVw?%(JY z|Ehb8f4^7!TCeSw7QgPlp!E9v`QbhKzwQ;k)@%D^`LCrGzOLWaa{AZ#Z>{t?{_^T| zTtP}-O*Lq}exK@brClFVdi}mQpyF>+nn1N%V&g52&M~8`39*H@36{8&!n;IEiaFMt zV!rz+Iw>yBVeuLg?T8s=NmL_~&321T4X|6|N2SCi*%Qqvoo(hsi@F_cu{*3uHmfbx zGM4T=9-^%_hsB=4*61XAti^s;pWTv}G}@d{)MFiOjZHNtI6B8AnByHBTGWt`6l3-n zDXC3`Uc5`sv81-Y7_R19Tva&hyJAsI%8quWugXTEHQrtjv7>XW#gS6%(Nl?Hk6Kqk zllE}Olpg$NC$^+mOED7#1uFLJNJ_QG6d#acO|&GYrWA!}w~n^NMmyM0TCI#kD9WTPPLqNl>l-m&Mf7LUdgvS++n9)-7eYE}5GL%91%1?1IOxVj4)larED z6AH3X`0R0Zr_Iq>+byY0Sue4x&_}X8X|%;=w#6`byCsG@;m`? z8kN9dc{$ZW%h3rITYSnWPtF{j3tXkxxwi^RT4JNt7YOwH3P-P%9BVOkH3db61O^0} z{7vzer0AF==Frr2ct}`4SZGA3f2Z)^fS{1zkSKrCofQ%7c{NelB_t#)GAOhQwS13w zOa(@C2`{kau#Dw%Fz?gvaurd6tI&YRER3tmEWxZ(XtUH#nnXkPY+Lri49+Rom+0O3PL_u(}08Cz~L;*Fe~)mY6ry7t{MShHFAQtUcw2FSgai zD|OqV&d%D%Ycl}(H(tdbH7DaCITc-O)3K=EOl0nzgV(3e#iv!?f~(tG$mo*+dsYS> z{v!jGljdRS)OmQa_1jqSLR=xxET3Si*e|; z#YjwBg27vsU`pN+r2oDI`Hh#Nao|#Xd37mLGnQfD+sn}H^JS0+mf@FM%V74&f~!In zTGYxy*QhM)e>n>SbF*;byDU6dB^z~ov*GwxHY!JFWAfN+^ct6q#jj-}-N;7NmTX+w zmyM43*=SiU2MvO9(6VO^PDbTmn>7cOZ8_MSnu8WIaxj2w&&Ff;aBp@2p8T+~3_M;l_FryvU6K(v8V0+}QuF8w*ytadnLw>(;q3Ws@6yHoFnO!;SH~ z+~{)9jX}rVNIyY4C*2tMtsCE*q3)s^=kwk8;WsyGU3X*LZ8s`bFfg@-fkSl-oM>s_ z;Qa>NZ4At8Z(vwQ1Fr@ds2ys+zpH^R4;eV}uz@*`7L`nnQRvPHJ&Opue27V-Y8);*!ffn0&zt2G30|s+%pk1B;`(Xp`kppD^ z5d-y(88}TEouuz?InUFaBdL9cZDb<(=sWs6Yryvc=RrRB(SW#YVB_z!34v`s0+lNY z+*e5;psK(xz5*{)7x;+Os3EYvrob0uOFe;l_4!O@H5NEQ+BOyNX(rJ5et~nOQJ}!5 zp#rx^>j;7LNP$6J1#%t{_=e9VCYY#*>A__)_2l@_wGc>?6$oQQ9L7j|uc5o5+ac z0z1iDCk3)d)G2{IWXiX!-)Vu0XXuMe{a)ZQ2|ml*o)d7At)$}*0;kDs;&(ydTM~GY zwIPiz3$(u?FoY~3`+jCE^O^r&1mb_=*qhAJE!K?OBnf{|b6em^@{W(hBlk&cC#xz- z945Y%ByN!>s!9m5!&l<*Y7*B;qUtu5>KX)3NJDDqzidGfwsu^q<$h}CbP*o@_|j_V=^g;^-7i) zL_Q{8lYhRBnpX@~SC*(5Ae;yDsSnl7PDGMVI)R!dnkvYP~Cah+##o#jY8 zNyd}8PKj%zo{PT7J7g!h=H@%Wkob}K3Fd`tBD=}IbGh%AvwsEaLXMM@#Pom65&8Zd ziLvj|_WKgmR+3fRrzHFXiS*SH7fIDMd{2<0YdOz#tjmYY>jv(7l1gT89=l{wf(U&L=a-&f>Aa*>^*1 zmU|_Rk@=tT9ZZgqGvp!}`Z@Qhhkbk(6BpU=Z|*zt(|+#P1B{CX?C3MfQ12TuW z$v$%CH0Mi}ong&L)OWN^W|9@;%vr84(&-%c5jjd8I?s0l`I2<_LE;ip}5Hg5l zknN=MkKFeplhpf(H6UBbEmGs6gqe&cuagY2hIG3uF@j7Z^T=w_>WajRWG$(5m7h;! zKG{SzlRR>r`28&LEEz}Eko9CxK66Myeqr3?Su&ffA>R?7U%5`mOtOTmC0ocf(&abS zg}h2Ok%OedHP(*2LKc#|>zv!~d@qov$zI}ngZq5J!w%j7m4I?-(0**){(2ES+(4Pf2WsrUJ%FA;bI2{4xS(INWkmJ zz+|_0PNFT*o};|zM02vKYnsV(l*j{zG}B0{&1@gntKjgOrwUf8)d7PoHF2cHZn|^g zV7A-M<4oynu=9}3YPWcfoc?T`XC@^aQ54L>`#))adDdY{w0R=r5oe6un(R4NV}Q>% zcv^31#e=pK9-k-KTP0e|wpRL(w3X-Jt)wZ8wwJ@yXtUkwIVx;rEz{zitQ0?o@%#?s zjs9(rM=eMzm=cej{7s&Kc*Aox&n*8&lg_2*Cn3-OMiWW&@4)}Fyn2?*|1PJ0VXfqR z>N%G8>b+Qcrh3lKxIVfkaFB`o8*pj+tn72SyH{zN+5y8& z+>%yXyvJ*5LV_uZ(=2qaN9Y0GyQ*N6|6Nwgx3AiNGJIZl?|j8uY!>zVOF>txIcQ{9 kc#OGIP*`kWr^t}Vz)mrd5uu#|10%!ZB0>Y>f&#<;3m)32MF0Q* literal 0 HcmV?d00001 diff --git a/tests/test_compression.py b/tests/test_compression.py index f26520e..2836d58 100644 --- a/tests/test_compression.py +++ b/tests/test_compression.py @@ -355,6 +355,104 @@ def test_wrong_format(self): with self.assertRaises(exceptions.FileFormatError): tszip.decompress(self.path) + def test_struct_metadata_roundtrip(self): + ts = msprime.simulate(10, random_seed=1) + + struct_metadata = { + "reverse_node_map": [847973, 1442881, 356055, 2542708, 285222, 175110] + } + + tables = ts.dump_tables() + schema = { + "codec": "struct", + "type": "object", + "properties": { + "reverse_node_map": { + "type": "array", + "items": { + "type": "integer", + "binaryFormat": "I", + }, # unsigned 32-bit int + } + }, + } + tables.metadata_schema = tskit.MetadataSchema(schema) + tables.metadata = struct_metadata + ts_with_metadata = tables.tree_sequence() + tszip.compress(ts_with_metadata, self.path) + ts_decompressed = tszip.decompress(self.path) + self.assertEqual(ts_decompressed.metadata, ts_with_metadata.metadata) + + def test_utf8_time_units_roundtrip(self): + """Test that time_units with non-ASCII UTF-8 characters work correctly.""" + ts = msprime.simulate(10, random_seed=1) + tables = ts.dump_tables() + # Use time_units with characters that require multi-byte UTF-8 encoding (>127) + tables.time_units = "μβrånches per γενεᾱ 世代" # Greek, Nordic, Chinese chars + ts_with_unicode_units = tables.tree_sequence() + + tszip.compress(ts_with_unicode_units, self.path) + ts_decompressed = tszip.decompress(self.path) + self.assertEqual(ts_decompressed.time_units, ts_with_unicode_units.time_units) + + def test_json_metadata_roundtrip(self): + ts = msprime.simulate(10, random_seed=1) + + json_metadata = { + "description": "Test tree sequence with JSON metadata", + "sample_count": 10, + "parameters": { + "Ne": 1000, + "mutation_rate": 1e-8, + "recombination_rate": 1e-8, + }, + "tags": ["test", "simulation", "msprime"], + "version": 1.0, + "unicode_text": "Héllo Wørld! 你好世界 🧬🌳", # Characters with ASCII > 127 + "author": "José María González-Pérez", # Accented characters + } + + tables = ts.dump_tables() + schema = { + "codec": "json", + "type": "object", + "properties": { + "description": {"type": "string"}, + "sample_count": {"type": "integer"}, + "parameters": { + "type": "object", + "properties": { + "Ne": {"type": "number"}, + "mutation_rate": {"type": "number"}, + "recombination_rate": {"type": "number"}, + }, + }, + "tags": {"type": "array", "items": {"type": "string"}}, + "version": {"type": "number"}, + "unicode_text": {"type": "string"}, + "author": {"type": "string"}, + }, + } + tables.metadata_schema = tskit.MetadataSchema(schema) + tables.metadata = json_metadata + ts_with_metadata = tables.tree_sequence() + tszip.compress(ts_with_metadata, self.path) + ts_decompressed = tszip.decompress(self.path) + self.assertEqual(ts_decompressed.metadata, json_metadata) + self.assertEqual( + ts_decompressed.metadata_schema, ts_with_metadata.metadata_schema + ) + + def test_raw_metadata_with_high_bytes(self): + ts = msprime.simulate(10, random_seed=1) + tables = ts.dump_tables() + raw_metadata_bytes = bytes([65, 66, 200, 150, 255, 128]) # Contains bytes > 127 + tables.metadata = raw_metadata_bytes + ts_with_metadata = tables.tree_sequence() + tszip.compress(ts_with_metadata, self.path) + ts_decompressed = tszip.decompress(self.path) + self.assertEqual(ts_decompressed.metadata, raw_metadata_bytes) + class TestFileErrors(unittest.TestCase): """ @@ -411,3 +509,20 @@ def test_open_both(self): ts = tszip.load(files / "1.0.0.trees.tsz") ts2 = tszip.load(files / "1.0.0.trees") assert ts == ts2 + + def test_issue95_metadata_dtype_regression(self): + # Test that we can decompress files with struct metadata that were compressed by + # version <=0.2.5 that stored metadata as the wrong dtype. + + files = pathlib.Path(__file__).parent / "files" + + ts_original = tszip.load(files / "issue95_metadata_dtype.trees") + # This file was compressed with 0.2.5 and should now decompress successfully + ts_decompressed = tszip.load(files / "issue95_metadata_bug.tsz") + + assert ts_decompressed.metadata == ts_original.metadata + assert isinstance(ts_decompressed.metadata, dict) + assert "reverse_node_map" in ts_decompressed.metadata + assert len(ts_decompressed.metadata["reverse_node_map"]) == len( + ts_original.metadata["reverse_node_map"] + ) diff --git a/tszip/compression.py b/tszip/compression.py index e34806c..380b65f 100644 --- a/tszip/compression.py +++ b/tszip/compression.py @@ -215,9 +215,9 @@ def compress_zarr(ts, root, variants_only=False): "reference_sequence/data", "reference_sequence/url", ]: - columns[name] = np.frombuffer(columns[name].encode("utf-8"), np.int8) + columns[name] = np.frombuffer(columns[name].encode("utf-8"), np.uint8) if name.endswith("metadata"): - columns[name] = np.frombuffer(columns[name], np.int8) + columns[name] = np.frombuffer(columns[name], np.uint8) # Some columns benefit from being quantised coordinates = np.unique( @@ -335,7 +335,13 @@ def decompress_zarr(root): if key.endswith("metadata_schema") or key == "time_units": dict_repr[key] = bytes(value).decode("utf-8") elif key.endswith("metadata"): - dict_repr[key] = bytes(value) + # Handle backward compatibility: <=0.2.5 versions stored metadata as int8 + # which can have negative values outside the valid byte range (0-255) + try: + dict_repr[key] = bytes(value) + except ValueError: + uint8_value = np.array(value, dtype=np.int8).astype(np.uint8) + dict_repr[key] = bytes(uint8_value) else: dict_repr[key] = value return tskit.TableCollection.fromdict(dict_repr).tree_sequence()