diff --git a/src/examples/lin1.c b/src/examples/lin1.c index 99efa4d..208eba7 100644 --- a/src/examples/lin1.c +++ b/src/examples/lin1.c @@ -26,7 +26,7 @@ THIS SOFTWARE. #include "asl.h" -main(int argc, char **argv) +int main(int argc, char **argv) { FILE *nl; int i; diff --git a/src/examples/lin2.c b/src/examples/lin2.c index c49767f..87857d1 100644 --- a/src/examples/lin2.c +++ b/src/examples/lin2.c @@ -26,7 +26,7 @@ THIS SOFTWARE. #include "asl.h" -main(int argc, char **argv) +int main(int argc, char **argv) { FILE *nl; int i, j, je; diff --git a/src/examples/lin3.c b/src/examples/lin3.c index 257c292..0663b84 100644 --- a/src/examples/lin3.c +++ b/src/examples/lin3.c @@ -26,7 +26,7 @@ THIS SOFTWARE. #include "asl.h" -main(int argc, char **argv) +int main(int argc, char **argv) { FILE *nl; int i, j, je; diff --git a/src/solvers/asldate.c b/src/solvers/asldate.c index 5f6da21..6a530b8 100644 --- a/src/solvers/asldate.c +++ b/src/solvers/asldate.c @@ -1 +1 @@ -long ASLdate_ASL = 20231111; +long ASLdate_ASL = 20241111; diff --git a/src/solvers/changes b/src/solvers/changes index f107bc6..6bea407 100644 --- a/src/solvers/changes +++ b/src/solvers/changes @@ -3349,3 +3349,87 @@ source file jac2dim.c. solvers2.tgz: minor tweaks as in solvers.tgz, plus a bug fix to pfghread.c that affected Hessians of imported functions having several arguments involving variables. + +20240106 + solvers2.tgz: fix a bug with constant terms that seemingly involve +variables, such as (v-v)^2; the terms were ignored instead of +contributing to the objective or constraint in which they appeared, +which mattered when the terms were nonzero, such as cos(v-v)^2. + +20240527 + jacdim.c: jac2dim_ should invoke jacpdim_ASL rather than (the +unavailable) jac2dim_ASL. + mpec_adj.c: fix a bug with complementarities involving double +inequalities on a linear expression. + +20240528 + pfghread.c in solvers2.tgz: fix a bug (e.g., fault) seen in +some Hessian computations or Hessian-vector products. + +0240601 + mpec_adj.c in solvers2.tgz: bug fix affecting ASL_cc_simplify. + +20240603 + sphes.c in solvers2.tgz: fix a glitch with -DALLOW_OPENMP. + +20240618 + solvers2.tgz: fix various bugs related to parallel evaluations. +Now we assume pthreads if neither ALLOW_OPENMP nor NO_MBLK_LOCK is +#defined. Add the possibility of computing Hessian-vector products in +parallel using at most asl->P.hesvecth processors. + +20240622 + solvers2.tgz: fix various more bugs related to parallel +evaluations. New int asl->P.thusedsetup records the number of threads +used in the last call on sphes_setup (which is called automatically if +needed and not yet called). This joins asl->P.thused for the number +of threads used in the most recent sphes call and asl->P.thusedhv for +the number of threads used in the most recent Hessian-vector product. +These numbers can be requested to be printed when they change by +setting bits 16, 32, or 64 of asl->P.sphopts for setup, sphes, and +(for 64) the various Hessian-vector product functions. + +20240926 + Fix a bug in obj_adj.c that only matters when suf_sos is called +and the model is of the form + + ... + var v; + s.t. vdef: v = ...; + minimize obj: v; + +(If instead one simply declared "minimize obj: ...;" then the bug did +not bite.) This fix affects both solvers.tgz and solvers2.tgz. + +20241018 + Fix bugs in sphes.c and pshvprod.c of solvers2.tgz affecting +parallel Hessian and Hessian-vector computations. + +20241108 + solvers2.tgz: fix some bugs in parallel evaluations; new +possibilities for doing Hessian evaluation and/or sparsity +determination in parallel, controlled by new values in psinfo.h: + + int hesevalth; /* number of threads for sphes_ew_ASL */ + int hessetupth; /* number of threads for sphes_setup_ew_ASL */ + int hesvecth; /* number of threads for Hessian-vector products */ + int sph_opts; /* options affecting threaded sphes and sphes_setup */ + +Partially separable structure: Hessian is the sum of products + matrix-transpose * internal Hessian * matrix. + +Current sph_opts bits: + 1 = assume internal Hessians are dense + 2 = print dimensions internal Hessians + 4 = suppress parallel setup for Hessian evaluations + 8 = suppress parallel Hessian evaluations + 16 = 0x10 = print number of threads for setting up Hessian evaluations + 32 = 0x20 = print number of threads for doing Hessian evaluations + 64 = 0x40 = print number of threads used for Hessian-vector products + 128 = 0x80 = suppress parallel Hessian-vector products + 256 = 0x100 = allow parallel evaluation of internal Hessians + used in Hessian-vector products. The default is not + to do this in parallel because it slows evaluations + on a large example. + +Note that parallel Hessian evaluations are not available in solvers.tgz. diff --git a/src/solvers/dtoa.c b/src/solvers/dtoa.c index 20089ea..5ad2fa5 100644 --- a/src/solvers/dtoa.c +++ b/src/solvers/dtoa.c @@ -28,7 +28,8 @@ * does this with many compilers. Whether this or another call is * appropriate depends on the compiler; for this to work, it may be * necessary to #include "float.h" or another system-dependent header - * file. + * file. When needed, this call avoids double rounding, which can + * cause one bit errors, e.g., with strtod on 8.3e26 or 6.3876e-16. */ /* strtod for IEEE-, VAX-, and IBM-arithmetic machines. @@ -2716,13 +2717,14 @@ enum { /* rounding values: same as FLT_ROUNDS */ }; void -gethex( const char **sp, U *rvp, int rounding, int sign MTd) +gethex(const char **sp, U *rvp, int rounding, int sign MTd) { Bigint *b; + char d; const unsigned char *decpt, *s0, *s, *s1; Long e, e1; ULong L, lostbits, *x; - int big, denorm, esign, havedig, k, n, nbits, up, zret; + int big, denorm, esign, havedig, k, n, nb, nbits, nz, up, zret; #ifdef IBM int j; #endif @@ -2740,6 +2742,9 @@ gethex( const char **sp, U *rvp, int rounding, int sign MTd) #endif #endif /*}}*/ }; +#ifdef IEEE_Arith + int check_denorm = 0; +#endif #ifdef USE_LOCALE int i; #ifdef NO_LOCALE_CACHE @@ -2891,7 +2896,7 @@ gethex( const char **sp, U *rvp, int rounding, int sign MTd) k++; b = Balloc(k MTa); x = b->x; - n = 0; + havedig = n = nz = 0; L = 0; #ifdef USE_LOCALE for(i = 0; decimalpoint[i+1]; ++i); @@ -2906,22 +2911,28 @@ gethex( const char **sp, U *rvp, int rounding, int sign MTd) if (*--s1 == '.') continue; #endif + if ((d = hexdig[*s1])) + havedig = 1; + else if (!havedig) { + e += 4; + continue; + } if (n == ULbits) { *x++ = L; L = 0; n = 0; } - L |= (hexdig[*s1] & 0x0f) << n; + L |= (d & 0x0f) << n; n += 4; } *x++ = L; b->wds = n = x - b->x; - n = ULbits*n - hi0bits(L); + nb = ULbits*n - hi0bits(L); nbits = Nbits; lostbits = 0; x = b->x; - if (n > nbits) { - n -= nbits; + if (nb > nbits) { + n = nb - nbits; if (any_on(b,n)) { lostbits = 1; k = n - 1; @@ -2934,8 +2945,8 @@ gethex( const char **sp, U *rvp, int rounding, int sign MTd) rshift(b, n); e += n; } - else if (n < nbits) { - n = nbits - n; + else if (nb < nbits) { + n = nbits - nb; b = lshift(b, n MTa); e -= n; x = b->x; @@ -2990,12 +3001,49 @@ gethex( const char **sp, U *rvp, int rounding, int sign MTd) return; } k = n - 1; +#ifdef IEEE_Arith + if (!k) { + switch(rounding) { + case Round_near: + if (((b->x[0] & 3) == 3) || (lostbits && (b->x[0] & 1))) { + multadd(b, 1, 1 MTa); + emin_check: + if (b->x[1] == (1 << (Exp_shift + 1))) { + rshift(b,1); + e = emin; + goto normal; + } + } + break; + case Round_up: + if (!sign && (lostbits || (b->x[0] & 1))) { + incr_denorm: + multadd(b, 1, 2 MTa); + check_denorm = 1; + lostbits = 0; + goto emin_check; + } + break; + case Round_down: + if (sign && (lostbits || (b->x[0] & 1))) + goto incr_denorm; + break; + } + } +#endif if (lostbits) lostbits = 1; else if (k > 0) lostbits = any_on(b,k); +#ifdef IEEE_Arith + else if (check_denorm) + goto no_lostbits; +#endif if (x[k>>kshift] & 1 << (k & kmask)) lostbits |= 2; +#ifdef IEEE_Arith + no_lostbits: +#endif nbits -= n; rshift(b,n); e = emin; @@ -3020,16 +3068,9 @@ gethex( const char **sp, U *rvp, int rounding, int sign MTd) k = b->wds; b = increment(b MTa); x = b->x; - if (denorm) { -#if 0 - if (nbits == Nbits - 1 - && x[nbits >> kshift] & 1 << (nbits & kmask)) - denorm = 0; /* not currently used */ -#endif - } - else if (b->wds > k + if (!denorm && (b->wds > k || ((n = nbits & kmask) !=0 - && hi0bits(x[k-1]) < 32-n)) { + && hi0bits(x[k-1]) < 32-n))) { rshift(b,1); if (++e > Emax) goto ovfl; @@ -3039,8 +3080,10 @@ gethex( const char **sp, U *rvp, int rounding, int sign MTd) #ifdef IEEE_Arith if (denorm) word0(rvp) = b->wds > 1 ? b->x[1] & ~0x100000 : 0; - else + else { + normal: word0(rvp) = (b->x[1] & ~0x100000) | ((e + 0x3ff + 52) << 20); + } word1(rvp) = b->x[0]; #endif #ifdef IBM @@ -3617,10 +3660,11 @@ strtod(const char *s00, char **se) c = *++s; if (c > '0' && c <= '9') { L = c - '0'; - s1 = s; - while((c = *++s) >= '0' && c <= '9') - L = 10*L + c - '0'; - if (s - s1 > 8 || L > 19999) + while((c = *++s) >= '0' && c <= '9') { + if (L <= 19999) + L = 10*L + c - '0'; + } + if (L > 19999) /* Avoid confusion from exponents * so large that e might overflow. */ @@ -5416,7 +5460,8 @@ dtoa_r(double dd, int mode, int ndigits, int *decpt, int *sign, char **rve, char && (spec_case ? 4*res <= ulp : (2*res < ulp || dig & 1))) { ulp_reached: if (ures < res - || (ures == res && dig & 1)) + || (ures == res && dig & 1) + || (dig == 9 && 2*ures <= ulp)) goto Roundup; goto retc; } @@ -5662,7 +5707,7 @@ dtoa_r(double dd, int mode, int ndigits, int *decpt, int *sign, char **rve, char } } if (!(zb & ures) && (ures-rb) << (1 - eulp) < ulp) { - if ((ures + rb) << (1 - eulp) < ulp) + if ((ures + rb) << (2 - eulp) < ulp) goto Roundup; goto Fast_failed1; } diff --git a/src/solvers/fpinit.c b/src/solvers/fpinit.c index e9063d7..edea0c4 100644 --- a/src/solvers/fpinit.c +++ b/src/solvers/fpinit.c @@ -49,6 +49,7 @@ int isatty_ASL; /* for use with "sw" under NT */ #undef WIN32 #define WIN32 #else +#define _GNU_SOURCE #include "fenv.h" #endif diff --git a/src/solvers/getstub.c b/src/solvers/getstub.c index 5971f16..1b423bc 100644 --- a/src/solvers/getstub.c +++ b/src/solvers/getstub.c @@ -616,6 +616,7 @@ badopt_ASL(Option_Info *oi) { oi->n_badopts++; oi->option_echo &= ~ASL_OI_echothis; + oi->need_nl = 0; } char * diff --git a/src/solvers/jac2dim.c b/src/solvers/jac2dim.c index 9416c74..814ebab 100644 --- a/src/solvers/jac2dim.c +++ b/src/solvers/jac2dim.c @@ -41,5 +41,5 @@ jac2dim_(const char *stub, fint *M, fint *N, fint *NO, fint *NZ, if (cur_ASL) return already_ASL("jacdim"); asl = ASL_alloc(ASL_read_pfgh); - return jac2dim_ASL(asl, stub, M, N, NO, NZ, MXROW, MXCOL, stub_len); + return jacpdim_ASL(asl, stub, M, N, NO, NZ, MXROW, MXCOL, stub_len); } diff --git a/src/solvers/mpec_adj.c b/src/solvers/mpec_adj.c index 006e961..470442c 100644 --- a/src/solvers/mpec_adj.c +++ b/src/solvers/mpec_adj.c @@ -51,6 +51,33 @@ reverse(int *a, int *b) } } + static real +rhsconst(ASL *asl, int i) +{ + expr_n *e; + int j; + + if (i < nlc) + return 0.; /* constraint is nonlinear */ + switch(asl->i.ASLtype) { + case 1: + case 2: + case 4: + e = (expr_n*)((ASL_fg*)asl)->I.con_de_[i].e; + break; + case 3: + e = (expr_n*)((ASL_fgh*)asl)->I.con2_de_[i].e; + break; + case 5: + e = (expr_n*)((ASL_pfgh*)asl)->I.con2_de_[i].e; + break; + default: + fprintf(Stderr, "Unexpected ASLtype in rhsconst\n"); + exit(1); + } + return e->v; + } + void mpec_adjust_ASL(ASL *asl) { @@ -211,7 +238,7 @@ mpec_adjust_ASL(ASL *asl) /* v3 - v4 = body, v3 >= 0, v4 >= 0, */ /* v1 complements v3, v2 complements v4 */ - *Lc = *Uc = 0.; + *Lc = *Uc = -rhsconst(asl, i); *ind1++ = v1 = n1++; *ind1++ = v2 = n1++; *ind2++ = v3 = n1++; @@ -220,18 +247,20 @@ mpec_adjust_ASL(ASL *asl) vset(Lv1, 0.); vset(Uv1, Infinity); } - ncg[1].varno = n2+3; + /* v3 - v4 = _con[i].body */ + ncg[1].varno = n2+3 /* v4 */; ncg[1].coef = 1.; ncg[1].next = 0; - ncg[0].varno = n2+2; + ncg[0].varno = n2+2 /* v3 */; ncg[0].coef = -1.; ncg[0].next = &ncg[1]; *pcg = ncg; ncg += 2; - ncg[1].varno = n2; + /* v1 = v - L */ + ncg[1].varno = n2 /* v1 */; ncg[1].coef = -1.; ncg[1].next = 0; - ncg[0].varno = j; + ncg[0].varno = j /* v */; ncg[0].coef = 1.; ncg[0].next = &ncg[1]; *Lc1 = *Uc1 = *Lv; @@ -239,7 +268,8 @@ mpec_adjust_ASL(ASL *asl) Uc1 += incc; *Cgrd1++ = ncg; ncg += 2; - ncg[1].varno = n2+1; + /* v2 = U - v 0 */ + ncg[1].varno = n2+1 /* v2 */; ncg[1].coef = 1.; ncg[1].next = 0; ncg[0].varno = j; diff --git a/src/solvers/obj_adj.c b/src/solvers/obj_adj.c index 3744c8a..24e874d 100644 --- a/src/solvers/obj_adj.c +++ b/src/solvers/obj_adj.c @@ -1,5 +1,5 @@ /******************************************************************* -Copyright (C) 2019, 2020 AMPL Optimization, Inc.; written by David M. Gay. +Copyright (C) 2019, 2020, 2024 AMPL Optimization, Inc.; written by David M. Gay. Permission to use, copy, modify, and distribute this software and its documentation for any purpose and without fee is hereby granted, @@ -31,6 +31,32 @@ of or in connection with the use or performance of this software. #include "obj_adj.h" /* for Objrep */ #include "r_qp.hd" /* for OPNUM */ + static void +suf_adjust(SufDesc *d, int k, int n) +{ + int i, i1, *ip; + real *rp; + + for(; d; d = d->next) { + if (d->kind & ASL_Sufkind_input) { + i = k; + if (d->kind & ASL_Sufkind_real) { + if ((rp = d->u.r)) { + for(; i < n; i = i1) { + i1 = i + 1; + rp[i] = rp[i1]; + } + } + } + else if ((ip = d->u.i)) + for(; i < n; i = i1) { + i1 = i + 1; + ip[i] = ip[i1]; + } + } + } + } + static void obj_adj1(ASL *asl, int no, int *pco, cgrad **pcgo, real *prhs) { @@ -222,8 +248,11 @@ obj_adj1(ASL *asl, int no, int *pco, cgrad **pcgo, real *prhs) --n_conjac[1]; if (n_conjac[1] > m) n_conjac[1] = m; - if (!cm && co != m) - cm = get_vcmap_ASL(asl, ASL_Sufkind_con); + if (co != m) { + suf_adjust(asl->i.suffixes[1], co, m); + if (!cm) + cm = get_vcmap_ASL(asl, ASL_Sufkind_con); + } if (cclass) { if (oclass) { i = oclass[no] = cclass[co]; @@ -272,6 +301,7 @@ obj_adj1(ASL *asl, int no, int *pco, cgrad **pcgo, real *prhs) } n_var = --n; if (cv != n) { + suf_adjust(asl->i.suffixes[0], cv, n); vm = get_vcmap_ASL(asl, ASL_Sufkind_var); nx = asl->i.n_var0 + asl->i.nsufext[ASL_Sufkind_var] - 1; for(i = cva; i < nx; ++i) diff --git a/src/solvers/psinfo.h b/src/solvers/psinfo.h index ca1c307..fa2dc4e 100644 --- a/src/solvers/psinfo.h +++ b/src/solvers/psinfo.h @@ -105,6 +105,9 @@ range { int *cei; /* common expressions: union over refs */ real *hest; /* nonzero ==> internal Hessian triangle */ /* computed by hvpinit */ +#ifdef MULTIPLE_THREADS + real *hsave; /* temp. threaded Hessian results, thlen numbers */ +#endif }; struct @@ -247,6 +250,7 @@ ps_info { int *iOscratch; /* length = nmax */ Ihinfo *ihi; Ihinfo *ihi1; /* first with positive count */ + size_t thlen; /* total number of scratch cells for threaded Hessians */ int hes_setup_called; int nmax; /* max{r in ranges} r->n */ int ihdcur; /* Current max internal Hessian dimension, */ @@ -255,6 +259,9 @@ ps_info { int ihdmin; /* min possible ihd > 0 and <= ihdmax, or 0 */ int khesoprod; /* used in new_Hesoprod in sputhes.c */ int pshv_g1; /* whether pshv_prod should multiply by g1 */ + int hesevalth; /* number of threads for sphes_ew_ASL */ + int hessetupth; /* number of threads for sphes_setup_ew_ASL */ + int hesvecth; /* number of threads for Hessian-vector products */ int linmultr; /* linear common terms used in more than one range */ int linhesfun; /* linear common terms in Hessian funnels */ int nlmultr; /* nonlin common terms used in more than one range */ @@ -265,6 +272,8 @@ ps_info { int npsgcomp; /* Has psgcomp been called? For sphes_setup. */ expr_va *valist; /* for sphes_setup */ expr_if *iflist; /* for sphes_setup */ + int sph_opts; /* options affecting threaded sphes and sphes_setup */ + int thused; /* number of threads most recenly used for Hessian computations */ int *zlsave; /* for S->_zl */ real *oyow; /* for xpsg_check */ int onobj; /* for xpsg_check */ diff --git a/src/solvers/xsum0.out b/src/solvers/xsum0.out index 9d3d915..a271d54 100644 --- a/src/solvers/xsum0.out +++ b/src/solvers/xsum0.out @@ -4,12 +4,12 @@ amplsolv.lbc e92fe2c2 1075 amplsolv.sy 5451119 1343 arith.ibm f398be2b 1391 arith.h0 e206209c 2309 -arith.h1 12e5e42c 1871 +arith.h1 f053bd25 177 arithchk.c 63b0185 6532 asl.h 1fe3527a 42925 asl_pfg.h 6483336 1268 asl_pfgh.h 49b5a53 1288 -asldate.c f734fb13 29 +asldate.c 8721c18 29 atof.c 1fabc7d3 1747 auxinfo.c 1a3c8690 1488 avltree.c 10aeaa58 11346 @@ -32,7 +32,7 @@ conval.c f1e4dc7d 4569 degree.c 3eafa26 4337 derprop.c e75c750f 1361 details.c0 fa7ec7b3 182 -dtoa.c 576045b 158447 +dtoa.c f84c88f9 159375 dtoa1.c 1928cdca 3352 duthes.c e13b176c 3874 dvalue.hd 9606f1 1125 @@ -44,7 +44,7 @@ fg_write.c 198882e0 17423 fgh_read.c ed246ea7 34450 float.h0 11d95cda 1822 fpecatch.c e1d8a914 1574 -fpinit.c e816f8aa 5665 +fpinit.c 66e5c26 5685 fpinitmt.c f58d6f11 4947 fpsetprec.s 1d0b2972 535 fpsetprec64.s 987339e 488 @@ -59,12 +59,12 @@ funcaddr.c fc16b89a 1197 g_fmt.c 642f235 3141 genrowno.c 1a8171dc 1805 getenv.c e86c81a2 1466 -getstub.c e05acb35 13344 +getstub.c e8e3042d 13362 getstub.h f626e91d 7079 htcl.c 6522bee 2279 indic_cons.c f1f99a7c 13970 jac0dim.c 7d6131d 7033 -jac2dim.c c2d8eba 1694 +jac2dim.c 1408b79e 1694 jac2dim.h 81b97e7 1982 jacdim.c e2ace738 1794 jacinc.c ef90060e 2276 @@ -80,7 +80,7 @@ makefile.vc 1bb69704 7500 makefile.wat c465856 5860 mip_pri.c f1dd1d47 7887 misc.c e6adc1b5 31635 -mpec_adj.c f15f4697 9223 +mpec_adj.c e13eb74e 9834 mpec_adj0.c efb8be74 70 mqpcheckv.c e60e70cf 19903 mypow.c f6f74614 2464 @@ -91,7 +91,7 @@ nlp2.h fcdcbc2a 7126 nqpcheck.c 1e266369 18358 nqpcheckZ.c ea215f98 1188 obj2val.c e939c11 4492 -obj_adj.c 1c990cd0 17623 +obj_adj.c f419c232 18138 obj_adj.h eed794cf 2218 obj_adj0.c ffe042be 67 obj_prec.c fd065560 1440 @@ -107,7 +107,7 @@ pfg_read.c 1dca51d4 98321 pfghread.c b815512 1107 printf.c 147de217 24293 pshvprod.c 1dcadea7 32089 -psinfo.h f251dd4 10201 +psinfo.h 1641cefc 10710 punknown.c fa46cf10 1704 qp_read.c eb2d7246 3986 qpcheck.c 1712f297 1588 diff --git a/src/solvers2/asl.h b/src/solvers2/asl.h index 26d1858..f58a973 100644 --- a/src/solvers2/asl.h +++ b/src/solvers2/asl.h @@ -381,17 +381,26 @@ Exitcall { #ifdef ALLOW_OPENMP #undef MULTIPLE_THREADS #define MULTIPLE_THREADS -#ifdef OPENMP_MBLK_LOCK #include "omp.h" #undef MBLK_LOCK #define MBLK_LOCK omp_lock_t -#define ACQUIRE_MBLK_LOCK(I,L) if (I->rd_M1z_bytes) omp_set_lock(&((I)->L)) -#define FREE_MBLK_LOCK(I,L) if (I->rd_M1z_bytes) omp_unset_lock(&((I)->L)) -#endif -#endif -#ifndef MBLK_LOCK +#define ACQUIRE_MBLK_LOCK(I,L) if ((I)->rd_M1z_bytes) omp_set_lock(&((I)->L)) +#define FREE_MBLK_LOCK(I,L) if ((I)->rd_M1z_bytes) omp_unset_lock(&((I)->L)) +#else +#ifdef NO_MBLK_LOCK +#undef MBLK_LOCK +#undef MULTIPLE_THREADS #define ACQUIRE_MBLK_LOCK(I,L) /*nothing*/ #define FREE_MBLK_LOCK(I,L) /*nothing*/ +#else +#include +#define MBLK_LOCK pthread_mutex_t +#ifndef MULTIPLE_THREADS +#define MULTIPLE_THREADS +#endif +#define ACQUIRE_MBLK_LOCK(I,L) pthread_mutex_lock(&((I)->L)) +#define FREE_MBLK_LOCK(I,L) pthread_mutex_unlock(&((I)->L)) +#endif #endif typedef struct @@ -413,6 +422,8 @@ EvalStats { typedef struct DerivErrInfo DerivErrInfo; typedef struct linarg linarg; + typedef struct ParHvInfo ParHvInfo; + typedef struct ParHvInit ParHvInit; struct EvalWorkspace { @@ -462,6 +473,7 @@ EvalWorkspace { real *oyow; /* for xpsg_check */ real *oyow0; /* for xpsg_check */ real *H0; /* for sphes */ + real *hvscratch; /* for parallel Hessian-vector products */ struct Hesoprod *hop_free, *hop_free0, *hop_free_end; struct uHeswork *uhw_free, *uhw_free0, *uhw_free_end; arglist *al; @@ -471,6 +483,8 @@ EvalWorkspace { int nlv; /* max(nlvc, nlvo) */ void *dbuginfo; EvalWorkspace *ewthread; + ParHvInfo *parhvinfo; + ParHvInit *parhvinit; EvalStats stats; }; @@ -492,9 +506,6 @@ MBavail { MBFree { uint nalloc, nfree; MBavail *a; -#ifdef MBLK_LOCK - MBLK_LOCK Lock; -#endif } MBFree; typedef struct @@ -506,9 +517,6 @@ Edaginfo { int nlmode; func_info **funcs_, *funcsfirst_, *funcslast_; int (*xscanf_)(EdRead*, const char*, ...); -#ifdef MBLK_LOCK - MBLK_LOCK MemLock, Mem1Lock; -#endif func_info *fhash_[NFHASH]; real *LUrhs_, /* constraint lower (and, if Urhsx == 0, */ @@ -768,6 +776,9 @@ Edaginfo { size_t ew_bytes; /* bytes allocated for one EvalWorkspace */ size_t n_ew0; /* number of EvalWorkspace structures allocated */ +#ifdef MBLK_LOCK + MBLK_LOCK MemLock, Mem1Lock; +#endif } Edaginfo; struct diff --git a/src/solvers2/asldate.c b/src/solvers2/asldate.c index bd80e37..6a530b8 100644 --- a/src/solvers2/asldate.c +++ b/src/solvers2/asldate.c @@ -1 +1 @@ -long ASLdate_ASL = 20240106; +long ASLdate_ASL = 20241111; diff --git a/src/solvers2/derprop.c b/src/solvers2/derprop.c index a59c534..b48d29a 100644 --- a/src/solvers2/derprop.c +++ b/src/solvers2/derprop.c @@ -22,7 +22,7 @@ software. void derprop(derpblock *db, real *s, real *w, real f) { - derp *d, *de; + derp *d, *de; size_t n; d = db->d0; diff --git a/src/solvers2/dtoa.c b/src/solvers2/dtoa.c index 20089ea..5ad2fa5 100644 --- a/src/solvers2/dtoa.c +++ b/src/solvers2/dtoa.c @@ -28,7 +28,8 @@ * does this with many compilers. Whether this or another call is * appropriate depends on the compiler; for this to work, it may be * necessary to #include "float.h" or another system-dependent header - * file. + * file. When needed, this call avoids double rounding, which can + * cause one bit errors, e.g., with strtod on 8.3e26 or 6.3876e-16. */ /* strtod for IEEE-, VAX-, and IBM-arithmetic machines. @@ -2716,13 +2717,14 @@ enum { /* rounding values: same as FLT_ROUNDS */ }; void -gethex( const char **sp, U *rvp, int rounding, int sign MTd) +gethex(const char **sp, U *rvp, int rounding, int sign MTd) { Bigint *b; + char d; const unsigned char *decpt, *s0, *s, *s1; Long e, e1; ULong L, lostbits, *x; - int big, denorm, esign, havedig, k, n, nbits, up, zret; + int big, denorm, esign, havedig, k, n, nb, nbits, nz, up, zret; #ifdef IBM int j; #endif @@ -2740,6 +2742,9 @@ gethex( const char **sp, U *rvp, int rounding, int sign MTd) #endif #endif /*}}*/ }; +#ifdef IEEE_Arith + int check_denorm = 0; +#endif #ifdef USE_LOCALE int i; #ifdef NO_LOCALE_CACHE @@ -2891,7 +2896,7 @@ gethex( const char **sp, U *rvp, int rounding, int sign MTd) k++; b = Balloc(k MTa); x = b->x; - n = 0; + havedig = n = nz = 0; L = 0; #ifdef USE_LOCALE for(i = 0; decimalpoint[i+1]; ++i); @@ -2906,22 +2911,28 @@ gethex( const char **sp, U *rvp, int rounding, int sign MTd) if (*--s1 == '.') continue; #endif + if ((d = hexdig[*s1])) + havedig = 1; + else if (!havedig) { + e += 4; + continue; + } if (n == ULbits) { *x++ = L; L = 0; n = 0; } - L |= (hexdig[*s1] & 0x0f) << n; + L |= (d & 0x0f) << n; n += 4; } *x++ = L; b->wds = n = x - b->x; - n = ULbits*n - hi0bits(L); + nb = ULbits*n - hi0bits(L); nbits = Nbits; lostbits = 0; x = b->x; - if (n > nbits) { - n -= nbits; + if (nb > nbits) { + n = nb - nbits; if (any_on(b,n)) { lostbits = 1; k = n - 1; @@ -2934,8 +2945,8 @@ gethex( const char **sp, U *rvp, int rounding, int sign MTd) rshift(b, n); e += n; } - else if (n < nbits) { - n = nbits - n; + else if (nb < nbits) { + n = nbits - nb; b = lshift(b, n MTa); e -= n; x = b->x; @@ -2990,12 +3001,49 @@ gethex( const char **sp, U *rvp, int rounding, int sign MTd) return; } k = n - 1; +#ifdef IEEE_Arith + if (!k) { + switch(rounding) { + case Round_near: + if (((b->x[0] & 3) == 3) || (lostbits && (b->x[0] & 1))) { + multadd(b, 1, 1 MTa); + emin_check: + if (b->x[1] == (1 << (Exp_shift + 1))) { + rshift(b,1); + e = emin; + goto normal; + } + } + break; + case Round_up: + if (!sign && (lostbits || (b->x[0] & 1))) { + incr_denorm: + multadd(b, 1, 2 MTa); + check_denorm = 1; + lostbits = 0; + goto emin_check; + } + break; + case Round_down: + if (sign && (lostbits || (b->x[0] & 1))) + goto incr_denorm; + break; + } + } +#endif if (lostbits) lostbits = 1; else if (k > 0) lostbits = any_on(b,k); +#ifdef IEEE_Arith + else if (check_denorm) + goto no_lostbits; +#endif if (x[k>>kshift] & 1 << (k & kmask)) lostbits |= 2; +#ifdef IEEE_Arith + no_lostbits: +#endif nbits -= n; rshift(b,n); e = emin; @@ -3020,16 +3068,9 @@ gethex( const char **sp, U *rvp, int rounding, int sign MTd) k = b->wds; b = increment(b MTa); x = b->x; - if (denorm) { -#if 0 - if (nbits == Nbits - 1 - && x[nbits >> kshift] & 1 << (nbits & kmask)) - denorm = 0; /* not currently used */ -#endif - } - else if (b->wds > k + if (!denorm && (b->wds > k || ((n = nbits & kmask) !=0 - && hi0bits(x[k-1]) < 32-n)) { + && hi0bits(x[k-1]) < 32-n))) { rshift(b,1); if (++e > Emax) goto ovfl; @@ -3039,8 +3080,10 @@ gethex( const char **sp, U *rvp, int rounding, int sign MTd) #ifdef IEEE_Arith if (denorm) word0(rvp) = b->wds > 1 ? b->x[1] & ~0x100000 : 0; - else + else { + normal: word0(rvp) = (b->x[1] & ~0x100000) | ((e + 0x3ff + 52) << 20); + } word1(rvp) = b->x[0]; #endif #ifdef IBM @@ -3617,10 +3660,11 @@ strtod(const char *s00, char **se) c = *++s; if (c > '0' && c <= '9') { L = c - '0'; - s1 = s; - while((c = *++s) >= '0' && c <= '9') - L = 10*L + c - '0'; - if (s - s1 > 8 || L > 19999) + while((c = *++s) >= '0' && c <= '9') { + if (L <= 19999) + L = 10*L + c - '0'; + } + if (L > 19999) /* Avoid confusion from exponents * so large that e might overflow. */ @@ -5416,7 +5460,8 @@ dtoa_r(double dd, int mode, int ndigits, int *decpt, int *sign, char **rve, char && (spec_case ? 4*res <= ulp : (2*res < ulp || dig & 1))) { ulp_reached: if (ures < res - || (ures == res && dig & 1)) + || (ures == res && dig & 1) + || (dig == 9 && 2*ures <= ulp)) goto Roundup; goto retc; } @@ -5662,7 +5707,7 @@ dtoa_r(double dd, int mode, int ndigits, int *decpt, int *sign, char **rve, char } } if (!(zb & ures) && (ures-rb) << (1 - eulp) < ulp) { - if ((ures + rb) << (1 - eulp) < ulp) + if ((ures + rb) << (2 - eulp) < ulp) goto Roundup; goto Fast_failed1; } diff --git a/src/solvers2/dtoa1.c b/src/solvers2/dtoa1.c index b3ebf61..f75d18d 100644 --- a/src/solvers2/dtoa1.c +++ b/src/solvers2/dtoa1.c @@ -63,6 +63,8 @@ FREE_DTOA_LOCK(unsigned int n) if (n < 2) omp_unset_lock(&Locks[n]); } +#elif defined(MULTIPLE_THREADS) /*}{*/ +#define dtoa_get_threadno pthread_self #endif /*}*/ #ifndef MALLOC /*{{*/ diff --git a/src/solvers2/ewalloc2.c b/src/solvers2/ewalloc2.c index 2cf216f..78ed716 100644 --- a/src/solvers2/ewalloc2.c +++ b/src/solvers2/ewalloc2.c @@ -25,8 +25,8 @@ of or in connection with the use or performance of this software. #include "fpu_control.h" #endif - static void -wk_init(real *w, int *z, real t) + void +wk_init_ASL(real *w, int *z, real t) { int i, n; @@ -60,7 +60,7 @@ ewalloc2_ASL(ASL *a) case ASL_read_pfgh: break; default: - fprintf(Stderr, "\nUnexpected ASLtype = %d in ewalloc2_AS(()\n", a->i.ASLtype); + fprintf(Stderr, "\nUnexpected ASLtype = %d in ewalloc2_ASL(()\n", a->i.ASLtype); fflush(Stderr); exit(1); } @@ -186,10 +186,10 @@ ewalloc2_ASL(ASL *a) } } if (asl->P.wkinit0) - wk_init(w, asl->P.wkinit0, 0.); + wk_init_ASL(w, asl->P.wkinit0, 0.); if (asl->P.wkinit2) - wk_init(w, asl->P.wkinit2, 2.); + wk_init_ASL(w, asl->P.wkinit2, 2.); if (asl->P.wkinitm1) - wk_init(w, asl->P.wkinitm1, -1.); + wk_init_ASL(w, asl->P.wkinitm1, -1.); return rv; } diff --git a/src/solvers2/fpinit.c b/src/solvers2/fpinit.c index e9063d7..edea0c4 100644 --- a/src/solvers2/fpinit.c +++ b/src/solvers2/fpinit.c @@ -49,6 +49,7 @@ int isatty_ASL; /* for use with "sw" under NT */ #undef WIN32 #define WIN32 #else +#define _GNU_SOURCE #include "fenv.h" #endif diff --git a/src/solvers2/getstub.c b/src/solvers2/getstub.c index 8003623..2d9721f 100644 --- a/src/solvers2/getstub.c +++ b/src/solvers2/getstub.c @@ -610,6 +610,7 @@ badopt_ASL(Option_Info *oi) { oi->n_badopts++; oi->option_echo &= ~ASL_OI_echothis; + oi->asl->i.need_nl_ = 0; } char * diff --git a/src/solvers2/htcl.c b/src/solvers2/htcl.c index 07a963a..3740d0e 100644 --- a/src/solvers2/htcl.c +++ b/src/solvers2/htcl.c @@ -45,11 +45,11 @@ new_mblk_ASL(ASL *asl, uint k) if (k >= MBLK_KMAX_ASL) memfailure_ASL("new_mblk", "arg too big", k); mb = &asl->i.mblk_free[k]; - ACQUIRE_MBLK_LOCK(&asl->i, mb->Lock); + ACQUIRE_MBLK_LOCK(&asl->i, MemLock); na = ++mb->nalloc; if ((a = mb->a)) mb->a = a->mbnext; - FREE_MBLK_LOCK(&asl->i, mb->Lock); + FREE_MBLK_LOCK(&asl->i, MemLock); if (!a) { mblk_mem_ASL += L = sizeof(MBhead) + (sizeof(void*)<h.klass) >= MBLK_KMAX_ASL) memfailure_ASL("Del_mblk", "invalid klass", (size_t)k); mb = &asl->i.mblk_free[k]; - ACQUIRE_MBLK_LOCK(&asl->i, mb->Lock); + ACQUIRE_MBLK_LOCK(&asl->i, MemLock); a->mbnext = mb->a; mb->a = a; - FREE_MBLK_LOCK(&asl->i, mb->Lock); + FREE_MBLK_LOCK(&asl->i, MemLock); } diff --git a/src/solvers2/misc.c b/src/solvers2/misc.c index 10f33e3..dea61e5 100644 --- a/src/solvers2/misc.c +++ b/src/solvers2/misc.c @@ -818,7 +818,7 @@ ASL_alloc(int k) if (!Stderr) Stderr_init_ASL(); /* set Stderr if necessary */ Mach_ASL(); -#ifdef MULTIPLE_THREADS +#ifdef ALLOW_OPENMP init_dtoa_locks(); set_max_dtoa_threads(1); #endif @@ -837,9 +837,9 @@ ASL_alloc(int k) h = a->p.h.next = ASLhead_ASL.next; a->p.h.prev = h->prev; h->prev = ASLhead_ASL.next = &a->p.h; -#ifdef MBLK_LOCK_INIT - for(n = 0; n < MBLK_KMAX_ASL; ++n) - MBLK_LOCK_INIT(&a->mblk_free[n].Lock); +#ifdef ALLOW_OPENMP + omp_init_lock(&a->i.MemLock); + omp_init_lock(&a->i.Mem1Lock); #endif return cur_ASL = a; } @@ -859,14 +859,14 @@ mem_ASL(ASL *asl, unsigned int len) #else len = (len + (sizeof(int)-1)) & ~(sizeof(int)-1); #endif - ACQUIRE_MBLK_LOCK(&asl->i, MemLock); + ACQUIRE_MBLK_LOCK((&asl->i),MemLock); memNext = asl->i.memNext; if (memNext + len >= asl->i.memLast) { memNext = (char *)M1alloc(k = Egulp*Sizeof(void*) + len); asl->i.memLast = memNext + k; } asl->i.memNext = memNext + len; - FREE_MBLK_LOCK(&asl->i, MemLock); + FREE_MBLK_LOCK((&asl->i),MemLock); return memNext; } diff --git a/src/solvers2/mpec_adj.c b/src/solvers2/mpec_adj.c index 32471a0..cb483d4 100644 --- a/src/solvers2/mpec_adj.c +++ b/src/solvers2/mpec_adj.c @@ -36,8 +36,6 @@ MPEC_Adjust { int n0; /* original number of variables */ }; - static int ZeroExpr[2] = { 0, -1 }; - static void reverse(int *a, int *b) { @@ -50,23 +48,50 @@ reverse(int *a, int *b) } } + static real +rhsconst(ASL *asl, int i) +{ + int j, *o; + + if (i < nlc) + goto ret0; /* constraint is nonlinear */ + switch(asl->i.ASLtype) { + case 1: + case 2: + o = ((ASL_fg*)asl)->I.con_de_[i].o.e; + break; + case 3: + case 4: + case 5: + o = ((ASL_fgh*)asl)->I.con2_de_[i].o.e; + break; + default: + fprintf(Stderr, "Unexpected ASLtype in rhsconst\n"); + exit(1); + o = 0; /* avoid bogus warning */ + } + if (o + && (j = o[1]) < 0 + && (j += asl->i.numlen/sizeof(real)) >= 0) + return asl->i.numvals[j]; + ret0: + return 0.; + } + void mpec_adjust_ASL(ASL *asl) { MPEC_Adjust *mpa; - cde *cd; - cde2 *cd2; - cgrad **Cgrd, **Cgrd1, **Cgrda, *cg, *cg1, *ncg, **pcg; + cgrad **Cgrd, **Cgrd1, **Cgrda, *cg, *ncg, **pcg; char *hx0; int *cc, *ck, *cv, *ind1, *ind2, *vm; - int i, incc, incv, j, k, m, m0, n, n0, n1, n2, nb, ncc, ncc0, nib, nib0; - int nnv, v1; + int i, incc, incv, j, k, m, m0, n, n0, n1, nb, ncc, ncc0, nib, nib0; + int nnv, v1, v2, v3, v4; real *Lc, *Lc0, *Lc1, *Lv, *Lv0, *Lv1, *Uc, *Uc0, *Uc1, *Uv, *Uv0, *Uv1; real a, b, *x; size_t nz, nz0, nznew; n = n0 = n_var; - n2 = asl->i.n_var0; nib = niv + nbv; n1 = n - nib; nib0 = n - nib; /* offset of first linear integer or binary variable */ @@ -136,7 +161,8 @@ mpec_adjust_ASL(ASL *asl) nzc = nZc = nz; n_cc = ncc; nznew = nz - nz0; - ncg = (cgrad*)M1alloc(2*(ncc + ncc0)*sizeof(int) + nznew*sizeof(cgrad) + ncg = (cgrad*)M1alloc(2*(ncc + ncc0)*sizeof(int) + + nznew*sizeof(cgrad) + ncc0*sizeof(cgrad*) + sizeof(MPEC_Adjust)); asl->i.mpa = mpa = (MPEC_Adjust*)(ncg + nznew); Cgrda = mpa->Cgrda = (cgrad**)(mpa + 1); @@ -191,10 +217,9 @@ mpec_adjust_ASL(ASL *asl) j += nnv; *cc++ = i; pcg = &Cgrd[i]; - cg = 0; - while((cg1 = *pcg)) - pcg = &(cg = cg1)->next; - *Cgrda++ = cg; + while((cg = *pcg)) + pcg = &cg->next; + *Cgrda++ = ncg; Lc = Lc0 + incv*i; Uc = Uc0 + incc*i; Lv = Lv0 + incv*--j; @@ -209,27 +234,29 @@ mpec_adjust_ASL(ASL *asl) /* v3 - v4 = body, v3 >= 0, v4 >= 0, */ /* v1 complements v3, v2 complements v4 */ - *Lc = *Uc = 0.; - *ind1++ = /*v1 =*/ n1++; - *ind1++ = /*v2 =*/ n1++; - *ind2++ = /*v3 =*/ n1++; - *ind2++ = /*v4 =*/ n1++; + *Lc = *Uc = -rhsconst(asl, i); + *ind1++ = v1 = n1++; + *ind1++ = v2 = n1++; + *ind2++ = v3 = n1++; + *ind2++ = v4 = n1++; for(k = 0; k < 4; ++k) { vset(Lv1, 0.); vset(Uv1, Infinity); } - ncg[1].varno = n2+3; + /* v3 - v4 = _con[i].body */ + ncg[1].varno = v4; ncg[1].coef = 1.; ncg[1].next = 0; - ncg[0].varno = n2+2; + ncg[0].varno = v3; ncg[0].coef = -1.; ncg[0].next = &ncg[1]; *pcg = ncg; ncg += 2; - ncg[1].varno = n2; + /* v1 = v - L */ + ncg[1].varno = v1; ncg[1].coef = -1.; ncg[1].next = 0; - ncg[0].varno = j; + ncg[0].varno = j /* v */; ncg[0].coef = 1.; ncg[0].next = &ncg[1]; *Lc1 = *Uc1 = *Lv; @@ -237,10 +264,11 @@ mpec_adjust_ASL(ASL *asl) Uc1 += incc; *Cgrd1++ = ncg; ncg += 2; - ncg[1].varno = n2+1; + /* v2 = U - v 0 */ + ncg[1].varno = v2; ncg[1].coef = 1.; ncg[1].next = 0; - ncg[0].varno = j; + ncg[0].varno = j /* v */; ncg[0].coef = 1.; ncg[0].next = &ncg[1]; *Lc1 = *Uc1 = *Uv; @@ -248,7 +276,6 @@ mpec_adjust_ASL(ASL *asl) Uc1 += incc; *Cgrd1++ = ncg; ncg += 2; - n2 += 4; } else { /*nb == 1*/ @@ -257,10 +284,9 @@ mpec_adjust_ASL(ASL *asl) /* For v = _svar[j], replace */ /* v >= a with v1 = v - a, v1 >= 0, or */ /* v <= b with v1 = b - v, v1 >= 0 */ - v1 = n1++; vset(Lv1, 0.); vset(Uv1, Infinity); - ncg[1].varno = n2++; + ncg[1].varno = v1 = n1++; ncg[1].next = 0; ncg[0].varno = j; ncg[0].coef = 1.; @@ -279,8 +305,8 @@ mpec_adjust_ASL(ASL *asl) ncg += 2; } *ind1++ = v1; - *ind2++ = n1++; - ncg->varno = n2++; + *ind2++ = v2 = n1++; + ncg->varno = v2; ncg->next = 0; vset(Lv1, 0.); vset(Uv1, Infinity); @@ -296,30 +322,30 @@ mpec_adjust_ASL(ASL *asl) } } #undef vset - i = m0; asl->i.n_con1 += k = m - m0; - switch(asl->i.ASLtype) { - case ASL_read_pfg: - memset(((ASL_pfg*)asl)->P.cps + m0, 0, k*sizeof(ps_func)); - cd = ((ASL_pfg*)asl)->I.con_de_; - goto have_cd; - case ASL_read_f: - case ASL_read_fg: - cd = ((ASL_fg*)asl)->I.con_de_; - have_cd: - while(i < m) - cd[i++].o.e = ZeroExpr; - break; - case ASL_read_fgh: - cd2 = ((ASL_fgh*)asl)->I.con2_de_; - goto have_cd2; - case ASL_read_pfgh: - memset(((ASL_pfgh*)asl)->P.cps + m0, 0, k*sizeof(ps_func2)); - cd2 = ((ASL_pfgh*)asl)->I.con2_de_; - have_cd2: - while(i < m) - cd2[i++].o.e = ZeroExpr; - } +// i = m0; +// switch(asl->i.ASLtype) { +// case ASL_read_pfg: +// memset(((ASL_pfg*)asl)->P.cps + m0, 0, k*sizeof(ps_func)); +// cd = ((ASL_pfg*)asl)->I.con_de_; +// goto have_cd; +// case ASL_read_f: +// case ASL_read_fg: +// cd = ((ASL_fg*)asl)->I.con_de_; +// have_cd: +// while(i < m) +// cd[i++].o.e = ZeroExpr; +// break; +// case ASL_read_fgh: +// cd2 = ((ASL_fgh*)asl)->I.con2_de_; +// goto have_cd2; +// case ASL_read_pfgh: +// memset(((ASL_pfgh*)asl)->P.cps + m0, 0, k*sizeof(ps_func2)); +// cd2 = ((ASL_pfgh*)asl)->I.con2_de_; +// have_cd2: +// while(i < m) +// cd2[i++].o.e = ZeroExpr; +// } } void @@ -351,7 +377,6 @@ mpec_auxvars_ASL(ASL *asl, real *c, real *x) vmi = get_vminv_ASL(asl); do { t = c[i = *cc++]; - c[i] = 0.; j = cv[i] - 1; for(cg = *Cga++; ; cg = cg->next) { if (!cg) @@ -359,11 +384,10 @@ mpec_auxvars_ASL(ASL *asl, real *c, real *x) if (cg->varno >= n0) break; } - if (!cg) - continue; Lc = Lc0 + i*incc; if (!*ck++) { - if (t >= 0.) + c[i] = 0.; + if ((t -= *Lc) >= 0.) x[vmi[cg->varno]] = t; else { cg = cg->next; @@ -380,10 +404,12 @@ mpec_auxvars_ASL(ASL *asl, real *c, real *x) } else { x[vmi[cg->varno]] = cg->coef*(*Lc - t); + /* actually (*Lc - t)/cg->coef, but cg->coef is 1 or -1 */ c[i] = *Lc; if (Lv0[incv*j] != 0.) { cg = (*Cg++)->next; x[vmi[cg->varno]] = cg->coef*(*Lc1 - x[j]); + Lc1 += incc; *ca++ = *Lc1; Lc1 += incc; } diff --git a/src/solvers2/obj_adj.c b/src/solvers2/obj_adj.c index 7e6a895..a7224e6 100644 --- a/src/solvers2/obj_adj.c +++ b/src/solvers2/obj_adj.c @@ -1,5 +1,5 @@ /******************************************************************* -Copyright (C) 2019, 2020 AMPL Optimization, Inc.; written by David M. Gay. +Copyright (C) 2019, 2020, 2024 AMPL Optimization, Inc.; written by David M. Gay. Permission to use, copy, modify, and distribute this software and its documentation for any purpose and without fee is hereby granted, @@ -29,6 +29,32 @@ of or in connection with the use or performance of this software. #undef psb_elem #include "obj_adj.h" /* for Objrep */ + static void +suf_adjust(SufDesc *d, int k, int n) +{ + int i, i1, *ip; + real *rp; + + for(; d; d = d->next) { + if (d->kind & ASL_Sufkind_input) { + i = k; + if (d->kind & ASL_Sufkind_real) { + if ((rp = d->u.r)) { + for(; i < n; i = i1) { + i1 = i + 1; + rp[i] = rp[i1]; + } + } + } + else if ((ip = d->u.i)) + for(; i < n; i = i1) { + i1 = i + 1; + ip[i] = ip[i1]; + } + } + } + } + static void obj_adj1(ASL *asl, int no, int *pco, cgrad **pcgo, real *prhs) { @@ -215,8 +241,11 @@ obj_adj1(ASL *asl, int no, int *pco, cgrad **pcgo, real *prhs) --n_conjac[1]; if (n_conjac[1] > m) n_conjac[1] = m; - if (!cm && co != m) - cm = get_vcmap_ASL(asl, ASL_Sufkind_con); + if (co != m) { + suf_adjust(asl->i.suffixes[1], co, m); + if (!cm) + cm = get_vcmap_ASL(asl, ASL_Sufkind_con); + } if (cclass) { if (oclass) { i = oclass[no] = cclass[co]; @@ -265,6 +294,7 @@ obj_adj1(ASL *asl, int no, int *pco, cgrad **pcgo, real *prhs) } n_var = --n; if (cv != n) { + suf_adjust(asl->i.suffixes[0], cv, n); vm = get_vcmap_ASL(asl, ASL_Sufkind_var); nx = asl->i.n_var0 + asl->i.nsufext[ASL_Sufkind_var] - 1; for(i = cva; i < nx; ++i) diff --git a/src/solvers2/pfghread.c b/src/solvers2/pfghread.c index 3bc6892..f905922 100644 --- a/src/solvers2/pfghread.c +++ b/src/solvers2/pfghread.c @@ -187,6 +187,7 @@ Static { int _nsce; int _nv0x; int _nzclim; + int rnsum; int slmax; int slscratchlev; int slscratchmax; @@ -607,10 +608,6 @@ new_opblock(Static *S, int need) return opnew; } -#ifdef DEBUG - /*DEBUG*/int zorkheswork, zorkheswork1, *zorkopnext; -#endif - static int* nextop(Static *S, int n) { @@ -623,10 +620,6 @@ nextop(Static *S, int n) op1 = op + n; } opnext = op1; -#ifdef DEBUG - /*DEBUG*/ if (op <= zorkopnext && op1 > zorkopnext) - /*DEBUG*/ printf(""); -#endif return op; } @@ -1154,17 +1147,28 @@ new_range(Static *S, range *r, range **rp) uilen = r->nv*sizeof(int); len = sizeof(range) + uilen; r1 = (range*)mem(len); - r1->nintv = 0; + r1->hest = r1->nintv = 0; n = r1->n = r->n; + S->rnsum += n; + r1->hsave = asl->P.hvhslen; if (n < (r1->nv = r->nv)) { + if (asl->P.rnmax1 < n) + asl->P.rnmax1 = n; r1->uhlen = 0; S->nhop += n*n; +#ifdef PSHVREAD + asl->P.thlen += (n*(n+1)) >> 1; + asl->P.hvhslen += n; +#endif } #ifdef PSHVREAD else { S->uhlen += r1->uhlen = sizeof(uHeswork) + (r->n-1)*sizeof(Ogptrs); if (asl->P.rnmax < n) asl->P.rnmax = n; + ++asl->P.nuhw; + asl->P.thlen += n; + asl->P.hvhslen += r->nv; } #endif r1->chksum = r->chksum; @@ -2971,7 +2975,7 @@ psfind(Static *S) #endif asl = S->asl; asl->P.krnmax = asl->P.rnmax = 0; - asl->P.dvsp0 = (int*)M1zapalloc(2*(Ncom)*sizeof(int)); + asl->P.dvsp0 = Ncom ? (int*)M1zapalloc(2*(Ncom)*sizeof(int)) : 0; asl->P.ndvsp = asl->P.dvsp0 + Ncom; m = asl->i.n_con0; dv_walk(S); @@ -5955,6 +5959,10 @@ pfg_read_ASL(ASL *a, FILE *nl, int flags) memset(X0, 0, nvr*sizeof(real)); if (havex0) memset(havex0, 0, nvr); + SS.rnsum = 0; +#ifdef PSHVREAD + asl->P.thlen = 0; +#endif asl->i.objconst = oc = (real*)M1zapalloc(x + no*sizeof(real)); con_de = Cde = (cde *)(oc + no); lcon_de = Lde = Cde + nc; @@ -6057,6 +6065,7 @@ pfg_read_ASL(ASL *a, FILE *nl, int flags) /* var_e[i].a for common variables i. */ adjust(S, flags); nzjac = nz; + asl->P.rnsum = SS.rnsum; #ifdef PSHVREAD a->p.Conival = conpival_ew_ASL; a->p.Conival_nomap = conpival_nomap_ew_ASL; @@ -6297,10 +6306,6 @@ heswork(int *o, int *oe) n = 0; while(o != oe) { -#ifdef DEBUG - /*DEBUG*/ if (++zorkheswork == zorkheswork1) - /*DEGUG*/ printf(""); -#endif switch(*o) { case OPCOPY0: @@ -6759,7 +6764,8 @@ hes_setup(Static *S) asl->P.khesoprod = 5; asl->P.nmax = nmax; asl->P.rtodo = h0 = h; - h += n = (nvx*sizeof(range*) + sizeof(real) - 1) / sizeof(real); + /* 20240616 changing asl->P.nran to asl->i.defvar0 in the next line */ + h += n = (asl->i.defvar0*sizeof(range*) + sizeof(real) - 1) / sizeof(real); asl->P.utodo = h; h += n; asl->P.otodo = h; diff --git a/src/solvers2/pshvprod.c b/src/solvers2/pshvprod.c index bcc5c40..7977ad6 100644 --- a/src/solvers2/pshvprod.c +++ b/src/solvers2/pshvprod.c @@ -26,6 +26,54 @@ extern "C" { #define alignarg(x) x #else #define alignarg(x) +#endif + +#ifdef ALLOW_OPENMP +#include +#define NO_PTHREADS +#endif +#ifdef NO_PTHREADS +#define PTHREADS(x) +#define pthread_mutex_lock(t) /*nothing*/ +#define pthread_mutex_unlock(t) /*nothing*/ +#ifndef ALLOW_OPENMP +#undef MULTIPLE_THREADS +#endif +#else +#define PTHREADS(x) x +#undef MULTIPLE_THREADS +#define MULTIPLE_THREADS +#include +#endif + +#ifdef MULTIPLE_THREADS + typedef struct { + ParHvInfo *tp; + int tno; + EvalWorkspace *ew; + range *r; + real *W; + real *hsave; /* temporary results */ + } Thparshv1; + +struct ParHvInfo { + ASL_pfgh *asl; + EvalWorkspace *ew; + Ihinfo *ihi; /* current */ + Thparshv1 *tpv1; + PTHREADS(pthread_mutex_t mutex;) + PTHREADS(pthread_t* T;) + int thpv1max; /* number of tpv1 structures */ + real *p; + real *ow; + real *y; + int nobj; + real *W; + real *hs0; /* hsave base */ + range *r; /* current */ + real align; /* make sure (ParHvInfo*)+1 is properly aligned */ + }; +extern void wk_init_ASL(real*, int*, real); #endif static void @@ -1284,8 +1332,8 @@ hfg_back(Ops *O, real *w) } } - static void -funnelhes(EvalWorkspace *ew) + void +funnelhes_ew_ASL(EvalWorkspace *ew) { ASL_pfgh *asl; Eresult *u; @@ -1377,7 +1425,7 @@ hvp0comp(EvalWorkspace *ew, real *hv, real *p, int nobj, real *ow, real *y) asl = (ASL_pfgh*)ew->asl; W = ew->w; if (ew->x0kind & ASL_need_funnel) - funnelhes(ew); + funnelhes_ew_ASL(ew); if (nobj >= 0 && nobj < n_obj) { ow = ow ? ow + nobj : &edag_one_ASL; no = nobj; @@ -1585,6 +1633,295 @@ dtmul(int n, real *x0, real *h, real *y0) } return x0; } +#ifdef MULTIPLE_THREADS + static int +hvtodo(ParHvInfo *tp, Thparshv1 *tp1) +{ + Ihinfo *ihi; + range *r; + int k; + + if (!(ihi = tp->ihi)) + return 0; + if (!(r = tp->r)) { + /* starting */ + if (!(r = ihi->r)) + return 0; + goto have_r; + } + if ((r = r->rlist.prev)) + goto have_r; + if (!(ihi = tp->ihi = ihi->next) || !(r = ihi->r)) + return 0; + tp->ihi = ihi; + have_r: + tp1->r = tp->r = r; + tp1->hsave = tp->hs0 + r->hsave; + return 1; + } + + EvalWorkspace* +ewalloc4_ASL(ASL_pfgh *asl, EvalWorkspace *ew) +{ + /* Similar to ewalloc2_ASL, but omitting H0. */ + + EvalWorkspace *rv; + Varval *v, *ve; + arglist *al; + char *s; + const char **sa; + func_info *fi; + int i, nc, nf, nlogc, nlv, no, nv, nv0, nvx; + real *d, *lastx, *ra, *w; + size_t extra, L, La, Le, Lo, Lu, Lx, Lxc, *nxc; + tfinfo **ptfi, *tfi; +#ifdef NANDEBUG + typedef union {real r; unsigned int x[2]; } U; + U u0, *u, *ue; +#endif + + switch (asl->i.ASLtype) { + /* case ASL_read_fgh: */ + /* case ASL_read_pfg: */ + case ASL_read_pfgh: + break; + default: + fprintf(Stderr, "\nUnexpected ASLtype = %d in ewalloc4(()\n", asl->i.ASLtype); + fflush(Stderr); + exit(1); + } + nc = nclcon; + nlogc = n_lcon; + no = n_obj; + nf = asl->i.nfinv; + nv = asl->i.n_var_; + La = nf * sizeof(arglist); + Lo = (nc + no)*sizeof(real); + Lu = asl->I.gscrx * sizeof(real); + Lx = x0len; + Le = (sizeof(EvalWorkspace) + sizeof(real) - 1) & ~(sizeof(real)-1); + Lxc = ((2*(nc + no)+ nlogc)*sizeof(size_t) + sizeof(real) - 1) & ~(sizeof(real)-1); + if ((nlv = nlvc) < nlvo) + nlv = nlvo; + L = La + Le + Lo + Lu + Lx + asl->i.derplen + + asl->i.wlen + asl->i.numlen + Lxc + + (asl->i.ra_max + 2*nv)*sizeof(real) + asl->i.sa_max*sizeof(char*); + if (L & (sizeof(real) - 1)) { + extra = sizeof(real) - (L & (sizeof(real) - 1)); + L += extra; + } + rv = (EvalWorkspace*) M1alloc(L); + ACQUIRE_MBLK_LOCK(&asl->i, MemLock); + ++asl->i.n_ew0; + *asl->i.pewthread = rv; + asl->i.pewthread = &rv->ewthread; + FREE_MBLK_LOCK(&asl->i, MemLock); + s = (char*)rv + Le; + memset(rv, 0, Le + Lxc); + rv->asl = (ASL*)asl; + rv->nlv = nlv; + rv->nxval = 1; + rv->x0kind = ASL_first_x; + nxc = (size_t*)s; + rv->ncxval = nxc; + rv->ncxgval = nxc += nc; + rv->nlxval = nxc += nc; + rv->noxval = nxc += nlogc; + rv->noxgval = nxc += no; + s += Lxc; + rv->hop_free = rv->hop_free0 = 0; + rv->uhw_free = rv->uhw_free0 = rv->uhw_free_end = 0; + rv->H0 = 0; + lastx = (real*)s; + s += Lx; + rv->unopscr = (real*)s; + s += Lu; + rv->oyow0 = (real*)s; + s += Lo; + memcpy(s, asl->i.numvals, asl->i.numlen); + rv->w = w = (real*)(s += asl->i.numlen); +#ifdef NANDEBUG + u0.x[0] = 0x12345678; + u0.x[1] = 0xfff7abcd; /* signalling NaN */ + u = (U*)w; + ue = (U*)(w + asl->P.rtodo); + while(u < ue) + *u++ = u0; + __fpu_control = _FPU_IEEE - _FPU_EXTENDED + _FPU_DOUBLE - _FPU_MASK_IM; + _FPU_SETCW(__fpu_control); + signal(SIGFPE, fpecatch_ASL); +#endif + + /* The following loop only matters under obscure conditions when setting */ + /* up Hessian compuations involving defined variables having linear parts */ + /* and used linearly in a constraint or objective. */ + /* With default AMPL settings, this situation does not arise. */ + v = (Varval*)w; + for(ve = v + nv; v < ve; ++v) + v->dO = v->aO = 0.; + + memset(w + asl->P.rtodo, 0, asl->P.zaplen); + rv->Lastx = lastx; + nv0 = asl->i.n_var0; + nvx = nv0 + asl->i.nsufext[ASL_Sufkind_var]; + rv->dv = (Varval*)w + nvx; + rv->dv1 = (Varval*)rv->dv + combc + como; + s += asl->i.wlen; + rv->hvscratch = (real*)s; + s += nv*(2*sizeof(real)); + rv->derps = d = (real*)s; + if ((i = asl->i.maxvar - asl->i.defvar0) > 0) + memset(d + asl->i.defvar0, 0, i*sizeof(real)); + s += asl->i.derplen; +/* rv->derpzap = d + nvx + combc + como + comc1 + como1; */ + rv->wantderiv = want_derivs; + rv->ndhmax = asl->P.ndhmax; + if (asl->i.X0_) { + memcpy(w, asl->i.X0_, nv0*sizeof(real)); + if (nv > nv0) + memset(w + nv0, 0, (nv-nv0)*sizeof(real)); + } + else + memset(w, 0, nv*sizeof(real)); + if (nf) { + ra = (real*)s; + sa = (const char**)(ra + asl->i.ra_max); + rv->al = al = (arglist*)(sa + asl->i.sa_max); + ptfi = (tfinfo**)asl->i.invd; + memset(al, 0, nf*sizeof(arglist)); + for(i = 0; i < nf; ++i, ++al) { + tfi = ptfi[i]; + al->n = al->nin = tfi->n; + if ((al->nr = tfi->nr)) + al->ra = ra; + al->at = tfi->at; + al->dig = tfi->wd; + if ((al->nsin = tfi->n - tfi->nr)) + al->sa = sa; + fi = tfi->fi; + al->f = (function*)fi; + al->funcinfo = fi->funcinfo; + al->AE = asl->i.ae; + } + } + if (asl->P.wkinit0) + wk_init_ASL(w, asl->P.wkinit0, 0.); + if (asl->P.wkinit2) + wk_init_ASL(w, asl->P.wkinit2, 2.); + if (asl->P.wkinitm1) + wk_init_ASL(w, asl->P.wkinitm1, -1.); + /*if (ew)*/ + memcpy(rv->w, ew->w, asl->i.wlen); + return rv; + } + + static void* +tstart3(void *arg) +{ + ASL_pfgh *asl; + EvalWorkspace *ew; + ParHvInfo *tp; + Thparshv1 *tp1; + Varval *V; + int j, n, nobj, nv, *ov, *ove, *ui, *uie; + linarg *la, **lap, **lape; + range *r; + real *hs, *hse, *oc, *ow, *p, *s, t, *w, *wh, *wi, *x, *y; + + tp1 = arg; + tp = tp1->tp; + ow = tp->ow; + p = tp->p; + y = tp->y; + nobj = tp->nobj; + asl = tp->asl; + if (!(ew = tp1->ew)) { + tp1->ew = ew = ewalloc4_ASL(asl, tp->ew); + tp1->W = ew->w; + } + else if (ew->nxval != asl->i.Ew0->nxval) + memcpy(ew->w, asl->i.Ew0->w, asl->i.wlen); + w = ew->w; + wh = ew->hvscratch; + x = wh + n_var; + V = (Varval*)w; + s = w + asl->P.dOscratch; + top: + r = tp1->r; + hs = tp1->hsave; + if (r->hest) { + n = r->n; + nv = r->nv; + wi = wh; + if (n < nv) { + lap = r->lap; + lape = lap + n; + do { + la = *lap++; + ov = la->ov; + ove = ov + la->nnz; + oc = la->oc; + t = p[*ov]**oc++; + while(++ov < ove) + t += p[*ov]**oc++; + *wi++ = t; + } + while(lap < lape); + wi = dtmul(n, x, w + r->hest, wh); + lap = r->lap; + do *hs++ = *wi++; + while(++lap < lape); + } + else { + ui = r->ui; + uie = ui + nv; + do *wi++ = p[*ui++]; + while(ui < uie); + wi = dtmul(nv, x, w + r->hest, wh); + hse = hs + nv; + do *hs++ = *wi++; + while(hs < hse); + } + } + else { + n = r->n; + wi = s; + lap = r->lap; + lape = lap + n; + do { + la = *lap++; + ov = la->ov; + ove = ov + la->nnz; + oc = la->oc; + t = p[*ov]**oc++; + while(++ov < ove) + t += p[*ov]**oc++; + *wi++ = t; + } + while(lap < lape); + pshv_prod_ASL(ew, r, nobj, ow, y); + lap = r->lap; + do { + la = *lap++; + *hs++ = V[la->u.v].aO; + } + while(lap < lape); + } + pthread_mutex_lock(&tp->mutex); +#ifdef ALLOW_OPENMP +#pragma omp critical + { +#endif + j = hvtodo(tp,tp1); + pthread_mutex_unlock(&tp->mutex); +#ifdef ALLOW_OPENMP + } +#endif + if (j) + goto top; + return 0; + } +#endif extern void hvpinit_nc_ASL(EvalWorkspace*, int, int, real*, real*); @@ -1604,6 +1941,17 @@ hvpcomp_ew_ASL(EvalWorkspace *ew, real *hv, real *p, int nobj, real *ow, real *y psg_elem *g, *ge; range *r; real *W, *cscale, *oc, *owi, t, t1, t2, *p0, *s, *w, *wi, *x; +#ifdef MULTIPLE_THREADS + ParHvInfo *phvi; + Thparshv1 *tp1, *tpi; + int m, maxit = 1, nrng; + PTHREADS(pthread_t *T;) + real *hs, *hs0; +#ifndef ALLOW_OPENMP + int it; + void *rc; +#endif +#endif a = ew->asl; W = ew->w; @@ -1646,10 +1994,142 @@ hvpcomp_ew_ASL(EvalWorkspace *ew, real *hv, real *p, int nobj, real *ow, real *y v->dO = t; v->aO = v->adO = 0; } +#ifdef MULTIPLE_THREADS + if ((m = asl->P.hesvecth) <= 0 || (asl->P.sph_opts & 128)) + m = 0; + else { + nrng = asl->P.nran; + if (m > nrng) + m = nrng; + if ((phvi = ew->parhvinfo)) { + if (m <= phvi->thpv1max) + goto have_phvi; + free(phvi); + } + ew->parhvinfo = phvi = (ParHvInfo*)M1alloc(sizeof(ParHvInfo) + + asl->P.hvhslen*sizeof(real) + + m*(sizeof(Thparshv1) + PTHREADS(+ sizeof(pthread_t*)))); + if (!ew->hvscratch) + ew->hvscratch = (real*)M1alloc(2*n_var*sizeof(real)); + phvi->asl = asl; + phvi->ew = ew; + phvi->nobj = nobj; + phvi->W = W; + phvi->ow = ow; + phvi->p = p; + phvi->y = y; + PTHREADS(pthread_mutex_init(&phvi->mutex, 0);) + phvi->thpv1max = m; + hs0 = phvi->hs0 = (real*)(phvi + 1); + tp1 = phvi->tpv1 = (Thparshv1*)(hs0 + asl->P.hvhslen); + for(j = 0; j < m; j++) { + tp1->tp = phvi; + tp1->tno = j; + tp1->ew = 0; + tp1->W = 0; + tp1++; + } + tp1 = phvi->tpv1; + tp1->W = W; + PTHREADS(phvi->T = (pthread_t*)tp1;) + tp1->ew = ew; + have_phvi: + hs0 = phvi->hs0; + PTHREADS(T = phvi->T;) + phvi->ihi = asl->P.ihi1; + tp1 = phvi->tpv1; + phvi->r = 0; +#ifdef ALLOW_OPENMP +#pragma omp parallel num_threads(m) + { + int it, jt; + Thparshv1 *tpi; + + it = omp_get_thread_num(); + tpi = tp1 + it; +#pragma omp critical + { + if ((jt = hvtodo(phvi,tpi)) && maxit <= it) + maxit = it + 1; + } + if (jt) + tstart3(tpi); + } +#else + pthread_mutex_lock(&phvi->mutex); + for(it = 1; it < m; ++it) { + tpi = tp1 + it; + if (!hvtodo(phvi,tpi)) + break; + if ((j = pthread_create(&T[it], 0, &tstart3, tpi))) + fprintf(Stderr, "hvpcomp_ew_ASL: return %d from pthread_create.\n", j); + } + maxit = it; + j = hvtodo(phvi,tp1); + pthread_mutex_unlock(&phvi->mutex); + if (j) + tstart3(tp1); + while(it > 1) { + if ((j = pthread_join(T[--it], &rc))) + fprintf(Stderr, + "Return %d from pthread_join for thread %d\n", + j, it); + } +#endif + asl->P.thusedhv = maxit; + for(ihi = asl->P.ihi1; ihi && (r = ihi->r); ihi = ihi->next) { + if (r->hest) + for(; r; r = r->rlist.prev) { + n = r->n; + nv = r->nv; + hs = hs0 + r->hsave; + if (n < nv) { + lap = r->lap; + lape = lap + n; + do if ((t = *hs++)) { + la = *lap; + ov = la->ov; + ove = ov + la->nnz; + oc = la->oc; + do hv[*ov++] += t**oc++; + while(ov < ove); + } + while(++lap < lape); + } + else { + ui = r->ui; + uie = ui + nv; + do hv[*ui++] += *hs++; + while(ui < uie); + } + } + else + for(; r; r = r->rlist.prev) { + n = r->n; + lap = r->lap; + lape = lap + n; + hs = W + r->hest; + do { + la = *lap++; + if ((t = *hs++)) { + ov = la->ov; + ove = ov + la->nnz; + oc = la->oc; + do hv[*ov++] += t**oc++; + while(ov < ove); + } + } + while(lap < lape); + } + } + goto pdone; + } +#endif + s = &W[asl->P.dOscratch]; kw = kp + 1; w = (real*)new_mblk(kw); x = w + n_var; - s = &W[asl->P.dOscratch]; ns = 0; for(ihi = asl->P.ihi1; ihi && (r = ihi->r); ihi = ihi->next) { if (r->hest) @@ -1732,6 +2212,9 @@ hvpcomp_ew_ASL(EvalWorkspace *ew, real *hv, real *p, int nobj, real *ow, real *y wi = s + ns; while(wi > s) *--wi = 0.; +#ifdef MULTIPLE_THREADS + pdone: +#endif if (asl->P.nobjgroups) { if (nobj >= 0 && nobj < n_obj) { owi = ow ? ow + nobj : &edag_one_ASL; @@ -1794,6 +2277,13 @@ hvpcomp_ew_ASL(EvalWorkspace *ew, real *hv, real *p, int nobj, real *ow, real *y while(hv < w) *hv++ *= *s++; } +#ifdef MULTIPLE_THREADS + if (asl->P.hesvecth > 0 && (asl->P.sph_opts & 64) && asl->P.thusedhv != maxit) { + printf("threads for Hv products = %d\n", maxit); + fflush(stdout); + } + asl->P.thusedhv = maxit; +#endif } /* Variant of hvpcomp_ew_ASL that has a final nerror argument, working @@ -1815,6 +2305,75 @@ hvpcompe_ew_ASL(EvalWorkspace *ew, real *hv, real *p, int nobj, real *ow, real * *Jp = Jsave; } +#ifdef DMG_DEBUG + static void +dbprint1(ASL_pfgh *asl, range *r, real *s, pthread_t tno) +{ + int i, n, *ov, *ove; + linarg *la, **lap, **lape; + real *oc, t; + + printf("pshv_prod th %ld {\n", tno); + printf("range %lx = %d x %d:", r, r->n, r->nv); + lap = r->lap; + lape = lap + (n = r->n); + while(lap < lape) { + la = *lap++; + ov = la->ov; + oc = la->oc; + t = *oc++; + if (t < 0.) { + t = -t; + printf("\n\t-"); + } + else + printf("\n\t"); + if (t != 1.) + printf("%g*", t); + ove = ov + la->nnz; + printf("x%d", *ov++); + while(ov < ove) { + t = *oc++; + if (t < 0.) { + printf(" - "); + t = -t; + } + else + printf(" + "); + if (t != 1.) + printf("%g*", t); + printf("x%d", *ov++); + } + } + for(i = 0; i < n; ++i) + if ((t = s[i])) + printf("\ns[%d] = %g", i, t); + printf("\n"); + } + + static void +dbprint2(EvalWorkspace *ew, range *r, pthread_t tno) +{ + Varval *V, *v; + int i, n; + linarg *la, **lap; + real t, *w; + + lap = r->lap; + n = r->n; + w = ew->w; + V = (Varval*)w; + for(i = 0; i < n; ++i) { + la = *lap++; + v = V + la->u.v; + if ((t = v->aO)) + printf("hv[%d] = %g\n", i, t); + } + printf("pshv_prod th %ld }\n", tno); + } +#endif + + void pshv_prod_ASL(EvalWorkspace *ew, range *r, int nobj, real *ow, real *y) { @@ -1829,6 +2388,9 @@ pshv_prod_ASL(EvalWorkspace *ew, range *r, int nobj, real *ow, real *y) psb_elem *b; psg_elem *g; real *cscale, *s, owi, t, *w; +#ifdef DMG_DEBUG + pthread_t tno; +#endif asl = (ASL_pfgh*)ew->asl; w = ew->w; @@ -1844,10 +2406,14 @@ pshv_prod_ASL(EvalWorkspace *ew, range *r, int nobj, real *ow, real *y) } } if (ew->x0kind & ASL_need_funnel) - funnelhes(ew); + funnelhes_ew_ASL(ew); s = &w[asl->P.dOscratch]; lap = r->lap; lape = lap + r->n; +#ifdef DMG_DEBUG + tno = pthread_self(); + dbprint1(asl, r, s, tno); +#endif while(lap < lape) { v = &V[(*lap++)->u.v]; v->dO = *s++; @@ -1923,6 +2489,9 @@ pshv_prod_ASL(EvalWorkspace *ew, range *r, int nobj, real *ow, real *y) x->adO += v->adO; } } +#ifdef DMG_DEBUG + dbprint2(ew, r, tno); +#endif } void diff --git a/src/solvers2/psinfo.h b/src/solvers2/psinfo.h index 5c652d3..943c4a1 100644 --- a/src/solvers2/psinfo.h +++ b/src/solvers2/psinfo.h @@ -134,6 +134,9 @@ range { int *cei; /* common expressions: union over refs */ int hest; /* nonzero ==> internal Hessian triangle */ /* computed by hvpinit starts at ew->w + hest */ + int hsave; /* offset of intermediate results during parallel */ + /* Hessian-vector products; length for this range: */ + /* min(n,nv). */ }; #ifndef PSHVREAD @@ -240,7 +243,7 @@ uHeswork { uHeswork *next; range *r; int *ui, *uie; - Ogptrs ogp[1]; /* scratch of length r->n */ + Ogptrs ogp[1]; /* scratch of length r->n */ }; typedef struct Ihinfo Ihinfo; @@ -286,6 +289,7 @@ ps_info { Ihinfo *ihi; Ihinfo *ihi1; /* first with positive count */ size_t zaplen; /* for zeroing memory starting at &w[asl->P.rtodo] */ + size_t thlen; /* total number of scratch cells for threaded Hessians */ int dOscratch, iOscratch, otodo, rtodo, utodo; /* subscripts in w for... */ int nmax; /* max{r in ranges} r->n */ int ihdmax; /* max possible ihd */ @@ -294,6 +298,9 @@ ps_info { int krnmax; /* based on rnmax (below); set in hvpinit_nc_ASL() */ int ndhmax; /* Initial wh->ndhmax */ int pshv_g1; /* whether pshv_prod should multiply by g1 */ + int hesevalth; /* number of threads for sphes_ew_ASL */ + int hessetupth; /* number of threads for sphes_setup_ew_ASL */ + int hesvecth; /* number of threads for Hessian-vector products */ int linmultr; /* linear common terms used in more than one range */ int linhesfun; /* linear common terms in Hessian funnels */ int nlmultr; /* nonlin common terms used in more than one range */ @@ -301,9 +308,21 @@ ps_info { int ncongroups; /* # of groups in constraints */ int nobjgroups; /* # of groups in objectives */ int rnmax; /* max r->n for ranges r with r->n >= r->nv */ + int rnmax1; /* max r->n for ranges r with r->n < r->nv */ + int sph_opts; /* options affecting threaded sphes and sphes_setup */ + int thusedsetup;/* number of threads used for sphes_setup */ + int thused; /* number of threads most recently used for Hessian computations */ + int thusedhv; /* number of threads most recently used for Hessian-vector prods */ + int nuhw; /* number of ranges with n >= nv */ + int nuhwmax; /* max number of ranges with n >= nv that affect any variable */ + /* computed by sphes if needed */ + int rnsum; /* sum of n values for ranges (for parallel Hessian-vector prods) */ + int hesmaxth; /* number of threads for which we have allocated memory */ + int hvhslen; /* length of storage for parallel Hessian-vector products */ int *zlsave; /* for S->_zl */ int *wkinit0, *wkinit2, *wkinitm1; /* For initializing components of w */ /* to 0, 2, -1, respectively. */ + real *hesparwk, **hestofree; #endif /* PSHVREAD */ split_ce *Split_ce; /* for sphes_setup */ } ps_info; @@ -404,6 +423,7 @@ tfinfo { extern real eval2_ASL(int*, EvalWorkspace*); extern void fullhes_ew_ASL(EvalWorkspace*, real*H, fint LH, int nobj, real*ow, real *y); extern void fullhese_ew_ASL(EvalWorkspace*, real*H, fint LH, int nobj, real*ow, real *y, fint*); + extern void funnelhes_ew_ASL(EvalWorkspace*); extern void hvpinit_ew_ASL(EvalWorkspace*, int hid_lim, int nobj, real *ow, real *y); extern void hvpinite_ew_ASL(EvalWorkspace*, int hid_lim, int nobj, real *ow, real *y, fint*); extern ASL_pfgh *pscheck_ASL(ASL*, const char*); diff --git a/src/solvers2/sphes.c b/src/solvers2/sphes.c index b13820f..f7bf39a 100644 --- a/src/solvers2/sphes.c +++ b/src/solvers2/sphes.c @@ -24,6 +24,24 @@ of or in connection with the use or performance of this software. #else #define alignarg(x) /*nothing*/ #endif +#ifdef ALLOW_OPENMP +/* asl.h has #defined MULTIPLE_THREADS */ +#include +#define NO_PTHREADS +#elif !defined(NO_PTHREADS) +#include +#ifndef MULTIPLE_THREADS +#define MULTIPLE_THREADS +#endif +#define PTHREADS(x) x +#endif +#ifdef NO_PTHREADS +#define pthread_mutex_lock(t) /*nothing*/ +#define pthread_mutex_unlock(t) /*nothing*/ +#define PTHREADS(x) /*nothing*/ +#endif +#undef exit +extern void wk_init_ASL(real*, int*, real); static void hv_fwd(int *o, real *w, int *oend) /* for determining sparsity */ @@ -886,7 +904,7 @@ new_uhw(EvalWorkspace *ew, range *r) uHeswork *rv = ew->uhw_free; ew->uhw_free = (uHeswork*)&rv->ogp[r->n]; if (ew->uhw_free > ew->uhw_free_end) { - /*DEBUG*/ fprintf(Stderr, "\n**** hop_free botch in new_Hesoprod!\n"); + /*DEBUG*/ fprintf(Stderr, "\n**** botch in new_uhw!\n"); exit(1); } return rv; @@ -1055,6 +1073,433 @@ upper_to_lower(EvalWorkspace *ew, SputInfo *spi, size_t nz) Del_mblk_ASL((ASL*)asl, rs); } +#ifdef MULTIPLE_THREADS + typedef struct RangeInfo RangeInfo; + struct +RangeInfo { + range *r; + real *hsave; /* for results of pshv_prod */ + uHeswork *uhw; + int i1; /* 0 <= i1 < r->n indicates r->lap[i1] */ + }; + + typedef struct { + ASL_pfgh *asl; + RangeInfo *ri, *ri0, *ria, *rie; + PTHREADS(pthread_mutex_t mutex;) + EvalWorkspace *ew; /* original workspace */ + SputInfo *spi; + uHeswork *uhw, **utodo; + int *rperm; + int i, iow, ir, iy, nobj, nr, nzc, start; + int *tdone; + linarg **lap, **lape; + real *ow, *y; + } Thpars; + + typedef struct { + int tno; /* thread number */ + int i; /* currently working on x[i] */ + int i1; /* currently working on ri->r->lap[i1] */ + Thpars *tp; + EvalWorkspace *ew; /* thread workspace and ASL pointer */ + RangeInfo *ri; /* current */ + uHeswork *uhw; /* current (n >= nv) */ + } Thpars1; + + static int +ttodo(Thpars *tp, Thpars1 *tp1) +{ + RangeInfo *ri; + linarg **lap; + range *r; + uHeswork *uhw; + + if (tp->start) { + tp->start = 0; + tp->ri = tp->ri0; + goto starting; + } + if ((lap = tp->lap) && lap < tp->lape) { + ri = tp1->ri = tp->ri; + tp1->uhw = 0; + tp1->i = tp->i; + tp1->i1 = lap - ri->r->lap; + tp->lap = lap + 1; + goto ret; + } + tp->ri++; + starting: + if (tp->ri >= tp->rie) { + tp1->ri = 0; + return 0; + } + tp1->ri = ri = tp->ri; + tp1->i = tp->i; + r = ri->r; + if ((uhw = ri->uhw) /*r->n >= r->nv*/) { + tp1->uhw = uhw; + tp->lap = 0; + tp1->i1 = ri->i1++; + } + else { + tp1->uhw = 0; + tp->lap = r->lap + 1; + tp->lape = r->lap + r->n; + tp1->i1 = 0; + } + ret: + return 1; + } + + static void* +tstart1(void *arg) +{ + ASL_pfgh *asl; + EvalWorkspace *ew; + Ogptrs *q, *qe; + RangeInfo *ri; + Thpars *tp; + Thpars1 *tp1; + Varval *V, *v; + int i, i1, j, n, nobj, ow, *ui, *uie, y; + linarg *la, **lap, **lape; + range *r; + real *hs, *s, *si, *w; + uHeswork *uhw; + + tp1 = arg; + tp = tp1->tp; + ow = tp->iow; + y = tp->iy; + nobj = tp->nobj; + ew = tp1->ew; + asl = (ASL_pfgh*)ew->asl; + w = ew->w; + V = (Varval*)w; + s = w + asl->P.dOscratch; + top: + i = tp1->i; + ri = tp1->ri; + r = ri->r; + lap = r->lap; + lape = lap + (n = r->n); + ui = r->ui; + uie = ui + r->nv; + while(ui < uie) { + v = V + *ui++; + v->aO = v->dO = v->adO = 0.; + } + hs = ri->hsave; + if ((uhw = tp1->uhw)) { + q = uhw->ogp; + qe = q + r->n; + si = s; + do { + if (q->ov < q->ove && *q->ov == i) + *si = 1; + si++; + } while(++q < qe); + pshv_prod1(ew, r, nobj, ow, y); + q = uhw->ogp; + si = s; + do { + if (q->ov < q->ove && *q->ov == i) { + *si = 0; + ++q->ov; + ++q->oc; + } + si++; + } while(++q < qe); + } + else { + i1 = tp1->i1; + si = s + i1; + *si = 1; + pshv_prod1(ew, r, nobj, ow, y); + *si = 0; + lap += i1; + hs += i1*n - i1*(i1-1)/2; + } + do { + la = *lap++; + *hs++ = V[la->u.v].aO; + } + while(lap < lape); + pthread_mutex_lock(&tp->mutex); +#ifdef ALLOW_OPENMP +#pragma omp critical + { +#endif + j = ttodo(tp,tp1); + pthread_mutex_unlock(&tp->mutex); +#ifdef ALLOW_OPENMP + } +#endif + if (j) + goto top; + return 0; + } + + static void* +tstart2(void *arg) +{ + ASL_pfgh *asl; + EvalWorkspace *ew; + Ogptrs *q, *qe; + RangeInfo *ri; + Thpars *tp; + Thpars1 *tp1; + Varval *V, *v; + int i, i1, j, n, nobj, *ui, *uie; + linarg *la, **lap, **lape; + range *r; + real *hs, *ow, *s, *si, *w, *y; + uHeswork *uhw; + + tp1 = arg; + tp = tp1->tp; + ow = tp->ow; + y = tp->y; + nobj = tp->nobj; + ew = tp1->ew; + asl = (ASL_pfgh*)ew->asl; + w = ew->w; + V = (Varval*)w; + s = w + asl->P.dOscratch; + top: + i = tp1->i; + ri = tp1->ri; + r = ri->r; + n = r->n; + si = s; + lap = r->lap; + lape = lap + n; + ui = r->ui; + uie = ui + r->nv; + while(ui < uie) { + v = V + *ui++; + v->aO = v->dO = v->adO = 0.; + } + hs = ri->hsave; + if ((uhw = tp1->uhw)) { + q = uhw->ogp; + qe = q + n; + do { + if (q->ov < q->ove && *q->ov == i) + *si = *q->oc; + si++; + } while(++q < qe); + pshv_prod_ASL(ew, r, nobj, ow, y); + q = uhw->ogp; + si = s; + do { + if (q->ov < q->ove && *q->ov == i) { + *si = 0; + ++q->ov; + ++q->oc; + } + si++; + } while(++q < qe); + } + else { + if ((i1 = tp1->i1)) { + hs += i1*n - i1*(i1-1)/2; + lap += i1; + } + si = s + i1; + *si = 1; + pshv_prod_ASL(ew, r, nobj, ow, y); + *si = 0; + } + do { + la = *lap++; + *hs++ = V[la->u.v].aO; + } + while(lap < lape); + pthread_mutex_lock(&tp->mutex); +#ifdef ALLOW_OPENMP +#pragma omp critical + { +#endif + j = ttodo(tp,tp1); + pthread_mutex_unlock(&tp->mutex); +#ifdef ALLOW_OPENMP + } +#endif + if (j) + goto top; + return 0; + } + + EvalWorkspace* +ewalloc3_ASL(ASL_pfgh *asl, EvalWorkspace *ew, int keep) +{ + /* Similar to ewalloc2_ASL, but omitting H0 and returning storage from */ + /* Malloc when keep == 0 (so we can can free the storage) rather than */ + /* M1alloc (keep == 1). */ + + EvalWorkspace *rv; + Varval *v, *ve; + arglist *al; + char *s; + const char **sa; + func_info *fi; + int i, nc, nf, nlogc, nlv, no, nv, nv0, nvx; + real *d, *lastx, *ra, *w; + size_t extra, L, La, Le, Lh, Lo, Lu, Luw, Lx, Lxc, *nxc; + tfinfo **ptfi, *tfi; +#ifdef NANDEBUG + typedef union {real r; unsigned int x[2]; } U; + U u0, *u, *ue; +#endif + + switch (asl->i.ASLtype) { + /* case ASL_read_fgh: */ + /* case ASL_read_pfg: */ + case ASL_read_pfgh: + break; + default: + fprintf(Stderr, "\nUnexpected ASLtype = %d in ewalloc3(()\n", asl->i.ASLtype); + fflush(Stderr); + exit(1); + } + nc = nclcon; + nlogc = n_lcon; + no = n_obj; + nf = asl->i.nfinv; + La = nf * sizeof(arglist); + Lo = (nc + no)*sizeof(real); + Lu = asl->I.gscrx * sizeof(real); + Lx = x0len; + Le = (sizeof(EvalWorkspace) + sizeof(real) - 1) & ~(sizeof(real)-1); + Lxc = ((2*(nc + no)+ nlogc)*sizeof(size_t) + sizeof(real) - 1) & ~(sizeof(real)-1); + Lh = ew ? 0 : (asl->I.nhop * sizeof(Hesoprod) + sizeof(real) - 1) & ~(sizeof(real)-1); +/* Luw = (asl->I.uhlen + sizeof(real) - 1) & ~(sizeof(real)-1); */ + Luw = 0; + if ((nlv = nlvc) < nlvo) + nlv = nlvo; + L = La + Le + Lh + Lo + Lu + Luw + Lx + asl->i.derplen + + asl->i.wlen + asl->i.numlen + Lxc + + asl->i.ra_max*sizeof(real) + asl->i.sa_max*sizeof(char*); + if (L & (sizeof(real) - 1)) { + extra = sizeof(real) - (L & (sizeof(real) - 1)); + L += extra; + } + rv = (EvalWorkspace*) (keep ? M1alloc(L) : Malloc(L)); + ACQUIRE_MBLK_LOCK(&asl->i, MemLock); + ++asl->i.n_ew0; + *asl->i.pewthread = rv; + asl->i.pewthread = &rv->ewthread; + FREE_MBLK_LOCK(&asl->i, MemLock); + s = (char*)rv + Le; + memset(rv, 0, Le + Lxc); + rv->asl = (ASL*)asl; + rv->nlv = nlv; + rv->nxval = 1; + rv->x0kind = ASL_first_x; + nxc = (size_t*)s; + rv->ncxval = nxc; + rv->ncxgval = nxc += nc; + rv->nlxval = nxc += nc; + rv->noxval = nxc += nlogc; + rv->noxgval = nxc += no; + s += Lxc; + if (ew) + rv->hop_free = rv->hop_free0 = 0; + else { + rv->hop_free0 = rv->hop_free = (Hesoprod*)s; + rv->hop_free_end = rv->hop_free0 + asl->I.nhop; + s += Lh; + } + rv->uhw_free = rv->uhw_free0 = rv->uhw_free_end = 0; +/* rv->uhw_free = rv->uhw_free0 =(uHeswork*)s; */ +/* rv->uhw_free_end = (uHeswork*)((char*)rv->uhw_free0 + asl->I.uhlen); */ +/* s += Luw; */ + rv->H0 = 0; + lastx = (real*)s; + s += Lx; + rv->unopscr = (real*)s; + s += Lu; + rv->oyow0 = (real*)s; + s += Lo; + memcpy(s, asl->i.numvals, asl->i.numlen); + rv->w = w = (real*)(s += asl->i.numlen); +#ifdef NANDEBUG + u0.x[0] = 0x12345678; + u0.x[1] = 0xfff7abcd; /* signalling NaN */ + u = (U*)w; + ue = (U*)(w + asl->P.rtodo); + while(u < ue) + *u++ = u0; + __fpu_control = _FPU_IEEE - _FPU_EXTENDED + _FPU_DOUBLE - _FPU_MASK_IM; + _FPU_SETCW(__fpu_control); + signal(SIGFPE, fpecatch_ASL); +#endif + nv = asl->i.n_var_; + + /* The following loop only matters under obscure conditions when setting */ + /* up Hessian compuations involving defined variables having linear parts */ + /* and used linearly in a constraint or objective. */ + /* With default AMPL settings, this situation does not arise. */ + v = (Varval*)w; + for(ve = v + nv; v < ve; ++v) + v->dO = v->aO = 0.; + + memset(w + asl->P.rtodo, 0, asl->P.zaplen); + rv->Lastx = lastx; + nv0 = asl->i.n_var0; + nvx = nv0 + asl->i.nsufext[ASL_Sufkind_var]; + rv->dv = (Varval*)w + nvx; + rv->dv1 = (Varval*)rv->dv + combc + como; + s += asl->i.wlen; + rv->derps = d = (real*)s; + if ((i = asl->i.maxvar - asl->i.defvar0) > 0) + memset(d + asl->i.defvar0, 0, i*sizeof(real)); + s += asl->i.derplen; +/* rv->derpzap = d + nvx + combc + como + comc1 + como1; */ + rv->wantderiv = want_derivs; + rv->ndhmax = asl->P.ndhmax; + if (asl->i.X0_) { + memcpy(w, asl->i.X0_, nv0*sizeof(real)); + if (nv > nv0) + memset(w + nv0, 0, (nv-nv0)*sizeof(real)); + } + else + memset(w, 0, nv*sizeof(real)); + if (nf) { + ra = (real*)s; + sa = (const char**)(ra + asl->i.ra_max); + rv->al = al = (arglist*)(sa + asl->i.sa_max); + ptfi = (tfinfo**)asl->i.invd; + memset(al, 0, nf*sizeof(arglist)); + for(i = 0; i < nf; ++i, ++al) { + tfi = ptfi[i]; + al->n = al->nin = tfi->n; + if ((al->nr = tfi->nr)) + al->ra = ra; + al->at = tfi->at; + al->dig = tfi->wd; + if ((al->nsin = tfi->n - tfi->nr)) + al->sa = sa; + fi = tfi->fi; + al->f = (function*)fi; + al->funcinfo = fi->funcinfo; + al->AE = asl->i.ae; + } + } + if (asl->P.wkinit0) + wk_init_ASL(w, asl->P.wkinit0, 0.); + if (asl->P.wkinit2) + wk_init_ASL(w, asl->P.wkinit2, 2.); + if (asl->P.wkinitm1) + wk_init_ASL(w, asl->P.wkinitm1, -1.); + if (ew) + memcpy(rv->w, ew->w, asl->i.wlen); + return rv; + } +#endif + fint sphes_setup_ew_ASL(EvalWorkspace *ew, SputInfo **pspi, int nobj, int ow, int y, int uptri) { @@ -1064,10 +1509,24 @@ sphes_setup_ew_ASL(EvalWorkspace *ew, SputInfo **pspi, int nobj, int ow, int y, Objrep **por; Ogptrs *q, *qe; SputInfo *spi, *spi1; +#ifdef MULTIPLE_THREADS + RangeInfo *ri, *ri0, *rie; + Thpars tp; + Thpars1 *tp1, *tpi; + int m, maxit = 1, nr; + real *hs, *hs0; + size_t nrng; +#ifndef ALLOW_OPENMP + pthread_t *T; + int it; + void *rc; +#endif +#endif Varval *V, *v; fint *hr, *hre, *hrownos, rv; - int i, j, khinfo, kz, n, n1, nhinfo, nlc0, no, noe, nov, nov1, nqslim, nzc; - int rfilter, robjno, *ov, *ov1, *ove, *ui, *zc, *zci; + int dassume, i, j, khinfo, kz; + int n, n1, nhinfo, nlc0, no, noe, nov, nov1, nqslim, nzc, nzc0; + int rfilter, robjno, *ov, *ov1, *ove, *ui, ushow, *zc, *zci; linarg *la, **lap, **lap1, **lape; ps_func *p, *pe; psb_elem *b; @@ -1082,6 +1541,7 @@ sphes_setup_ew_ASL(EvalWorkspace *ew, SputInfo **pspi, int nobj, int ow, int y, w = ew->w; V = (Varval*)w; j = asl->i.maxvar; + dassume = asl->P.sph_opts & 1; for(i = asl->i.defvar0 + asl->P.ncom; i < j; ++i) { v = V + i; v->dO = v->aO = v->adO = 0.; @@ -1118,6 +1578,7 @@ sphes_setup_ew_ASL(EvalWorkspace *ew, SputInfo **pspi, int nobj, int ow, int y, y = 1; if ((n = nlvo) < nlvc) n = nlvc; + ushow = asl->P.sph_opts & 2; if ((spi = *pspi)) { if (spi->ow == ow && spi->y == y && spi->nobj == nobj && spi->uptri == uptri) @@ -1132,10 +1593,11 @@ sphes_setup_ew_ASL(EvalWorkspace *ew, SputInfo **pspi, int nobj, int ow, int y, ew->hes_setup_called = 3; otodo = otodoi = (Hesoprod**)(w + asl->P.otodo); rtodo = (range**)(w + asl->P.rtodo); + memset(rtodo, 0, asl->P.nran*sizeof(range*)); utodo = utodoi = (uHeswork**)(w + asl->P.utodo); s = w + asl->P.dOscratch; nqslim = n >> 3; - kz = htcl(2*sizeof(int)*n + asl->P.nran*sizeof(range)); + kz = htcl(2*sizeof(int)*n + asl->P.nran*sizeof(range*)); rnext = (range**)new_mblk_ASL(a, kz); zc = (int*)(rnext + asl->P.nran); zci = zc + n; @@ -1167,8 +1629,7 @@ sphes_setup_ew_ASL(EvalWorkspace *ew, SputInfo **pspi, int nobj, int ow, int y, continue; } keep: - i = r->lasttermno; - rp = rtodo + i; + rp = rtodo + r->lasttermno; rnext[r->irange] = *rp; *rp = r; } @@ -1194,11 +1655,204 @@ sphes_setup_ew_ASL(EvalWorkspace *ew, SputInfo **pspi, int nobj, int ow, int y, } } } +#ifdef MULTIPLE_THREADS + nrng = asl->P.nran; + if ((m = asl->P.hessetupth) <= 0 || (asl->P.sph_opts & 5) || nrng <= 0) { + m = 0; + hs0 = 0; + } + else { + if (m > nrng) + m = nrng; + tp.asl = asl; +#ifdef ALLOW_OPENMP + /*omp_set_num_threads(m);*/ +#else + pthread_mutex_init(&tp.mutex, 0); +#endif + tp.nobj = nobj; + tp.iow = ow; + tp.iy = y; + tp.utodo = utodo; + hs0 = hs = (real*)new_mblk(htcl(m*( +#ifndef ALLOW_OPENMP + sizeof(pthread_t*) + +#endif + sizeof(Thpars1)) + + asl->P.thlen*sizeof(real) + + nrng*(sizeof(RangeInfo)))); + tp1 = (Thpars1*)(hs + asl->P.thlen); + tp.ew = ew; + tp.spi = spi; +#ifdef ALLOW_OPENMP + tp.ri0 = ri0 = (RangeInfo*)(tp1 + m); +#else + T = (pthread_t*)(tp1 + m); + T[0] = pthread_self(); + tp.ri0 = ri0 = (RangeInfo*)(T + m); +#endif + tp.rie = rie = ri0 + nrng; + tp1->tno = 0; + tp1->tp = &tp; + tp1->ew = ew; + for(i = 0; i < m; ++i) { + tpi = tp1 + i; + tpi->tno = i; + tpi->tp = &tp; + if (i) + tpi->ew = ewalloc3_ASL(asl,0,0); + } + } +#endif for(i = 0; i < n; i++) { nzc = 0; rp = rtodo; uhwi = *utodoi; *utodoi++ = 0; +#ifdef MULTIPLE_THREADS /*{*/ + + if (m) { + ri = ri0; + hs = hs0; + while((r = *rp)) { + rp = rnext + r->irange; + nr = r->n; + if (nr >= r->nv) { + uhw = new_uhw(ew, r); + uhw->next = uhwi; + uhwi = uhw; + uhw->r = r; + uhw->uie = (uhw->ui = r->ui) + r->nv; + q = uhw->ogp; + lap = r->lap; + lape = lap + nr; + while(lap < lape) { + la = *lap++; + q->ove = (q->ov = la->ov) + la->nnz; + q->oc = la->oc; + ++q; + } + } + else { + ri->r = r; + ri->uhw = 0; + ri->hsave = hs; + hs += nr*(nr+1)/2; + ++ri; + } + } + tp.ria = ri; + while((uhw = uhwi)) { + uhwi = uhw->next; + r = ri->r = uhw->r; + ri->uhw = uhw; + ri->hsave = hs; + hs += r->n; + ++ri; + } + if (tp.ri0 >= (tp.rie = ri)) + goto uhw_done; + tp.i = i; + tp.start = 1; +#ifdef ALLOW_OPENMP +#pragma omp parallel num_threads(m) + { + int it, jt; + Thpars1 *tpi; + + it = omp_get_thread_num(); + tpi = tp1 + it; +#pragma omp critical + { + if ((jt = ttodo(&tp,tpi)) && maxit <= it) + maxit = it + 1; + } + if (jt) + tstart1(tpi); + } +#else + pthread_mutex_lock(&tp.mutex); + for(it = 1; it < m; ++it) { + tpi = tp1 + it; + tpi->ri = 0; + if (!ttodo(&tp,tpi)) + break; + if ((j = pthread_create(&T[it], 0, &tstart1, tpi))) + fprintf(Stderr, "Return %d from pthread_create for tstart1.\n", j); + } + maxit = it; + j = ttodo(&tp,tp1); + pthread_mutex_unlock(&tp.mutex); + if (j) { + tp1->tp = &tp; + tstart1(tp1); + } + while(it > 1) { + if ((j = pthread_join(T[--it], &rc))) + fprintf(Stderr, + "Return %d from pthread_join for thread %d\n", + j, it); + } +#endif + for(ri = ri0; ri < tp.ria; ++ri) { + r = ri->r; + lap = r->lap; + lape = lap + r->n; + hs = ri->hsave; + while(lap < lape) { + lap1 = lap++; + la = *lap1++; + ov = la->ov; + nov = la->nnz; + if ((t = *hs++)) + new_Hesoprod(ew,nov,ov,0,nov,ov,0,t); + while(lap1 < lape) { + la = *lap1++; + if ((t = *hs++)) { + ov1 = la->ov; + nov1 = la->nnz; + new_Hesoprod(ew,nov,ov,0,nov1,ov1,0,t); + new_Hesoprod(ew,nov1,ov1,0,nov,ov,0,t); + } + } + } + } + for(; ri < tp.rie; ++ri) { + uhw = ri->uhw; + r = uhw->r; + lap = r->lap; + lape = lap + r->n; + hs = ri->hsave; + kz = 0; + nzc0 = nzc; + do { + if (*hs++) { + ++kz; + la = *lap; + ov = la->ov; + for(ove = ov + la->nnz; ov < ove; ++ov) + if ((j = *ov) <= i + && !zc[j]++) + zci[nzc++] = j; + } + } + while(++lap < lape); + if (ushow) { + if (r->n > r->nv) + printf("obrange %d\t%d", r->n, r->nv); + else + printf("sqrange %d", r->n); + printf("\t%d\t%d\n", kz, nzc-nzc0); + } + if ((ui = ++uhw->ui) < uhw->uie) { + utodoj = utodo + *ui; + uhw->next = *utodoj; + *utodoj = uhw; + } + } + } + else +#endif /*}*/ while((r = *rp)) { rp = rnext + r->irange; lap = r->lap; @@ -1243,34 +1897,64 @@ sphes_setup_ew_ASL(EvalWorkspace *ew, SputInfo **pspi, int nobj, int ow, int y, } } } - *rtodo++ = 0; /* reset */ +#ifdef MULTIPLE_THREADS + if (m) + goto uhw_done; +#endif while((uhw = uhwi)) { uhwi = uhwi->next; si = s; r = uhw->r; q = uhw->ogp; qe = q + r->n; - si = s; do { if (q->ov < q->ove && *q->ov == i) *si = 1.; si++; } while(++q < qe); - pshv_prod1(ew, r, nobj, ow, y); - lap = r->lap; lape = lap + r->n; - do { - la = *lap++; - if (V[la->u.v].aO) { + if (dassume) { + if (ushow) { + if (r->n > r->nv) + printf("obrange %d %d\n", r->n, r->nv); + else + printf("sqrange %d\n", r->n); + } + do { + la = *lap++; ov = la->ov; for(ove = ov + la->nnz; ov < ove; ++ov) if ((j = *ov) <= i && !zc[j]++) zci[nzc++] = j; } + while(lap < lape); + } + else { + pshv_prod1(ew, r, nobj, ow, y); + kz = 0; + nzc0 = nzc; + do { + la = *lap++; + if (V[la->u.v].aO) { + ++kz; + ov = la->ov; + for(ove = ov + la->nnz; ov < ove; ++ov) + if ((j = *ov) <= i + && !zc[j]++) + zci[nzc++] = j; + } + } + while(lap < lape); + if (ushow) { + if (r->n > r->nv) + printf("obrange %d\t%d", r->n, r->nv); + else + printf("sqrange %d", r->n); + printf("\t%d\t%d\n", kz, nzc-nzc0); + } } - while(lap < lape); q = uhw->ogp; si = s; @@ -1287,7 +1971,10 @@ sphes_setup_ew_ASL(EvalWorkspace *ew, SputInfo **pspi, int nobj, int ow, int y, *utodoj = uhw; } } - +#ifdef MULTIPLE_THREADS + uhw_done: +#endif + *rtodo++ = 0; /* reset */ hop1 = *otodoi; *otodoi++ = 0; while((hop = hop1)) { @@ -1358,7 +2045,19 @@ sphes_setup_ew_ASL(EvalWorkspace *ew, SputInfo **pspi, int nobj, int ow, int y, spi->ulcopy = 0; spi->uptolow = 0; del_mblk(rnext); +#ifdef MULTIPLE_THREADS + if (hs0) + del_mblk(hs0); +#endif done: +#ifdef MULTIPLE_THREADS + asl->P.thusedsetup = maxit; + if (asl->P.hessetupth > 0 && (asl->P.sph_opts & 16)) { + printf("threads for sph_setup = %d\n", maxit); + fflush(stdout); + } + asl->P.thused = 0; +#endif spi->hrownos = spi->hrn[0]; spi->hcolstartsZ = hcolstarts = spi->hcs[0]; rv = hcolstarts[n] - hcolstarts[0]; @@ -1391,6 +2090,19 @@ sphes_ew_ASL(EvalWorkspace *ew, SputInfo **pspi, real *H, int nobj, real *ow, re Ogptrs *q, *qe; Hesoprod *hop, *hop1, **otodo, **otodoi, **otodoj; SputInfo*spi, *spi0; +#ifdef MULTIPLE_THREADS + RangeInfo *ri, *ri0, *rie; + Thpars tp; + Thpars1 *tp1, *tpi; + int m, maxit = 1, nr; + real *hs, *hs0; + size_t nrng; +#ifndef ALLOW_OPENMP + pthread_t *T; + int it; + void *rc; +#endif +#endif Varval *V, *v; fint *hr; int i, j, k, n, no, noe, nov, nov1, *ov, *ov1, *ove, *ui, *uie, uptri; @@ -1399,7 +2111,7 @@ sphes_ew_ASL(EvalWorkspace *ew, SputInfo **pspi, real *H, int nobj, real *ow, re psg_elem *g, *ge; range *r, *r0, **rnext, **rp, **rtodo; real *Hi, *H0, *H00, *cscale, *oc, *oc1, *owi, *s, *si; - real t, t1, *vsc0, *vsc1, *vsc, *w, *y1; + real t, t1, *vsc, *vsc0, *vsc1, *w, *y1; size_t *hcs; ssize_t *ulc, *uli; uHeswork *uhw, *uhwi, **utodo, **utodoi, **utodoj; @@ -1445,6 +2157,7 @@ sphes_ew_ASL(EvalWorkspace *ew, SputInfo **pspi, real *H, int nobj, real *ow, re spi = *pspi; otodo = otodoi = (Hesoprod**)(w + asl->P.otodo); rtodo = (range**)(w + asl->P.rtodo); + memset(rtodo, 0, asl->P.nran*sizeof(range*)); utodo = utodoi = (uHeswork**)(w + asl->P.utodo); s = w + asl->P.dOscratch; n = ew->nlv; @@ -1516,57 +2229,178 @@ sphes_ew_ASL(EvalWorkspace *ew, SputInfo **pspi, real *H, int nobj, real *ow, re vsc = asl->i.vscale; vsc0 = vsc - Fortran; vsc1 = vsc; + if (ew->x0kind & ASL_need_funnel) + funnelhes_ew_ASL(ew); +#ifdef MULTIPLE_THREADS + if ((m = asl->P.hesevalth) < 1 || (asl->P.sph_opts & 8) || n <= 0) { + m = 0; + hs0 = 0; + } + else { + if (m > n) + m = n; + tp.asl = asl; +#ifdef ALLOW_OPENMP + /*omp_set_num_threads(m);*/ +#else + pthread_mutex_init(&tp.mutex, 0); +#endif + tp.nobj = nobj; + tp.ow = ow; + tp.y = y; + tp.utodo = utodo; + nrng = asl->P.nran; + if ((hs = asl->P.hesparwk) && m > asl->P.hesmaxth) { + free(hs); + *asl->P.hestofree = 0; + hs0 = 0; + goto alloc_anew; + } + if (!(hs0 = hs = asl->P.hesparwk)) { + alloc_anew: + asl->P.hesparwk = + hs = (real*)M1alloc(m*( +#ifndef ALLOW_OPENMP + sizeof(pthread_t*) + +#endif + sizeof(Thpars1)) + + asl->P.thlen*sizeof(real) + + nrng*(sizeof(RangeInfo))); + asl->P.hestofree = (real**)(asl->i.Mbnext - 1); /* kludge */ + asl->P.hesmaxth = m; + } + tp1 = (Thpars1*)(hs + asl->P.thlen); + tp.ew = ew; + tp.spi = spi; +#ifdef ALLOW_OPENMP + tp.ri0 = ri0 = (RangeInfo*)(tp1 + m); +#else + T = (pthread_t*)(tp1 + m); + T[0] = pthread_self(); + tp.ri0 = ri0 = (RangeInfo*)(T + m); +#endif + tp.rie = rie = ri0 + nrng; + tp1->tno = 0; + tp1->tp = &tp; + tp1->ew = ew; + if (!hs0) { + hs0 = hs; + for(i = 0; i < m; ++i) { + tpi = tp1 + i; + tpi->tno = i; + tpi->tp = &tp; + if (i) + tpi->ew = ewalloc3_ASL(asl,ew,1); + } + } + } +#endif for(i = 0; i < n; i++) { rp = rtodo; uhwi = *utodoi; *utodoi++ = 0; - while((r = *rp)) { - rp = rnext + r->irange; - lap = r->lap; - lape = lap + r->n; - ui = r->ui; - uie = ui + r->nv; - while(ui < uie) { - v = V + *ui++; - v->aO = v->dO = v->adO = 0.; - } - if (r->n >= r->nv) { - uhw = new_uhw(ew, r); - uhw->next = uhwi; - uhwi = uhw; - uhw->r = r; - uhw->ui = r->ui; - uhw->uie = uie; - q = uhw->ogp; - while(lap < lape) { - la = *lap++; - q->ove = (q->ov = la->ov) + la->nnz; - q->oc = la->oc; - ++q; +#ifdef MULTIPLE_THREADS /*{*/ + if (m) { + ri = ri0; + hs = hs0; + while((r = *rp)) { + rp = rnext + r->irange; + nr = r->n; + if (nr >= r->nv) { + uhw = new_uhw(ew, r); + uhw->next = uhwi; + uhwi = uhw; + uhw->r = r; + uhw->uie = (uhw->ui = r->ui) + r->nv; + q = uhw->ogp; + lap = r->lap; + lape = lap + r->n; + while(lap < lape) { + la = *lap++; + q->ove = (q->ov = la->ov) + la->nnz; + q->oc = la->oc; + ++q; + } + } + else { + ri->r = r; + ri->uhw = 0; + ri->hsave = hs; + hs += nr*(nr+1)/2; + ri->i1 = 0; + ++ri; } } - else { - si = s; + tp.ria = ri; + while((uhw = uhwi)) { + uhwi = uhw->next; + r = ri->r = uhw->r; + ri->uhw = uhw; + ri->hsave = hs; + hs += r->n; + ri->i1 = 0; + ++ri; + } + if (tp.ri0 >= (tp.rie = ri)) + goto uhw_done; + tp.i = i; + tp.start = 1; +#ifdef ALLOW_OPENMP +#pragma omp parallel num_threads(m) + { + int it, jt; + Thpars1 *tpi; + + it = omp_get_thread_num(); + tpi = tp1 + it; +#pragma omp critical + { + if ((jt = ttodo(&tp,tpi)) && maxit <= it) + maxit = it + 1; + } + if (jt) + tstart2(tpi); + } +#else + pthread_mutex_lock(&tp.mutex); + for(it = 1; it < m; ++it) { + tpi = tp1 + it; + tpi->ri = 0; + if (!ttodo(&tp,tpi)) + break; + if ((j = pthread_create(&T[it], 0, &tstart2, tpi))) + fprintf(Stderr, "Return %d from pthread_create.\n", j); + } + maxit = it; + j = ttodo(&tp,tp1); + pthread_mutex_unlock(&tp.mutex); + if (j) { + tp1->tp = &tp; + tstart2(tp1); + } + while(it > 1) { + if ((j = pthread_join(T[--it], &rc))) + fprintf(Stderr, + "Return %d from pthread_join for thread %d\n", + j, it); + } +#endif + for(ri = ri0; ri < tp.ria; ++ri) { + r = ri->r; + lap = r->lap; + lape = lap + r->n; + hs = ri->hsave; while(lap < lape) { - la = *lap++; - lap1 = lap; + lap1 = lap++; + la = *lap1++; ov = la->ov; oc = la->oc; nov = la->nnz; - *si = 1; - for(j = 0; j < nov; ++ j) - V[ov[j]].dO = oc[j]; - pshv_prod_ASL(ew, r, nobj, ow, y); - for(j = 0; j < nov; ++ j) - V[ov[j]].dO = 0.; - *si++ = 0; - v = V + la->u.v; - if ((t = v->aO)) - new_Hesoprod(ew, nov, ov, oc, nov, ov, oc, t); + if ((t = *hs++)) + new_Hesoprod(ew,nov,ov,oc,nov,ov,oc,t); while(lap1 < lape) { la = *lap1++; - v = V + la->u.v; - if ((t = v->aO)) { + if ((t = *hs++)) { ov1 = la->ov; oc1 = la->oc; nov1 = la->nnz; @@ -1576,53 +2410,136 @@ sphes_ew_ASL(EvalWorkspace *ew, SputInfo **pspi, real *H, int nobj, real *ow, re } } } + for(; ri < tp.rie; ++ri) { + uhw = ri->uhw; + r = uhw->r; + lap = r->lap; + lape = lap + r->n; + hs = ri->hsave; + do { + if ((t = *hs++)) { + la = *lap; + ov = la->ov; + oc = la->oc; + for(ov1 = ov + la->nnz; ov < ov1; ++ov, ++oc) + if ((j = *ov) <= i) + Hi[j] += t**oc; + } + } + while(++lap < lape); + if ((ui = ++uhw->ui) < uhw->uie) { + utodoj = utodo + *ui; + uhw->next = *utodoj; + *utodoj = uhw; + } + } } - *rtodo++ = 0; /* reset */ - while((uhw = uhwi)) { - uhwi = uhwi->next; - r = uhw->r; - q = uhw->ogp; - qe = q + r->n; - si = s; - do { - if (q->ov < q->ove && *q->ov == i) - *si = *q->oc; - si++; - } while(++q < qe); - - pshv_prod_ASL(ew, r, nobj, ow, y); - - lap = r->lap; - lape = lap + r->n; - do { - la = *lap++; - if ((t = V[la->u.v].aO)) { - ov = la->ov; - oc = la->oc; - for(ov1 = ov + la->nnz; ov < ov1; ++ov, ++oc) - if ((j = *ov) <= i) - Hi[j] += t**oc; + else +#endif /*}*/ + { + while((r = *rp)) { + rp = rnext + r->irange; + lap = r->lap; + lape = lap + r->n; + ui = r->ui; + uie = ui + r->nv; + while(ui < uie) { + v = V + *ui++; + v->aO = v->dO = v->adO = 0.; + } + if (r->n >= r->nv) { + uhw = new_uhw(ew, r); + uhw->next = uhwi; + uhwi = uhw; + uhw->r = r; + uhw->ui = r->ui; + uhw->uie = uie; + q = uhw->ogp; + while(lap < lape) { + la = *lap++; + q->ove = (q->ov = la->ov) + la->nnz; + q->oc = la->oc; + ++q; + } + } + else { + si = s; + while(lap < lape) { + la = *lap++; + lap1 = lap; + ov = la->ov; + oc = la->oc; + nov = la->nnz; + *si = 1; + for(j = 0; j < nov; ++ j) + V[ov[j]].dO = oc[j]; + pshv_prod_ASL(ew, r, nobj, ow, y); + for(j = 0; j < nov; ++ j) + V[ov[j]].dO = 0.; + *si++ = 0; + v = V + la->u.v; + if ((t = v->aO)) + new_Hesoprod(ew, nov, ov, oc, nov, ov, oc, t); + while(lap1 < lape) { + la = *lap1++; + v = V + la->u.v; + if ((t = v->aO)) { + ov1 = la->ov; + oc1 = la->oc; + nov1 = la->nnz; + new_Hesoprod(ew,nov,ov,oc,nov1,ov1,oc1,t); + new_Hesoprod(ew,nov1,ov1,oc1,nov,ov,oc,t); + } + } + } } } - while(lap < lape); - - q = uhw->ogp; - si = s; - do { - if (q->ov < q->ove && *q->ov == i) { - *si = 0; - ++q->ov; - ++q->oc; + while((uhw = uhwi)) { + uhwi = uhwi->next; + r = uhw->r; + q = uhw->ogp; + qe = q + r->n; + si = s; + do { + if (q->ov < q->ove && *q->ov == i) + *si = *q->oc; + si++; + } while(++q < qe); + pshv_prod_ASL(ew, r, nobj, ow, y); + lap = r->lap; + lape = lap + r->n; + do { + la = *lap++; + if ((t = V[la->u.v].aO)) { + ov = la->ov; + oc = la->oc; + for(ov1 = ov + la->nnz; ov < ov1; ++ov, ++oc) + if ((j = *ov) <= i) + Hi[j] += t**oc; + } + } + while(lap < lape); + q = uhw->ogp; + si = s; + do { + if (q->ov < q->ove && *q->ov == i) { + *si = 0; + ++q->ov; + ++q->oc; + } + si++; + } while(++q < qe); + if ((ui = ++uhw->ui) < uhw->uie) { + utodoj = utodo + *ui; + uhw->next = *utodoj; + *utodoj = uhw; } - si++; - } while(++q < qe); - if ((ui = ++uhw->ui) < uhw->uie) { - utodoj = utodo + *ui; - uhw->next = *utodoj; - *utodoj = uhw; } } - +#ifdef MULTIPLE_THREADS + uhw_done: +#endif + *rtodo++ = 0; /* reset */ hop1 = *otodoi; *otodoi++ = 0; while((hop = hop1)) { @@ -1658,12 +2575,20 @@ sphes_ew_ASL(EvalWorkspace *ew, SputInfo **pspi, real *H, int nobj, real *ow, re H0[j] = 0; } } - else + else { while(--k >= 0) { *H++ = H0[j = (int)*hr++]; H0[j] = 0; } + } } +#ifdef MULTIPLE_THREADS + if (asl->P.hesevalth > 0 && (asl->P.sph_opts & 32) && asl->P.thused != maxit) { + printf("threads for sphes = %d\n", maxit); + fflush(stdout); + } + asl->P.thused = maxit; +#endif H = H00; if ((ulc = spi->ulcopy)) for(uli = spi->ulcend; ulc < uli; ulc += 2) diff --git a/src/solvers2/xp2known.c b/src/solvers2/xp2known.c index cb28346..7f1ca6e 100644 --- a/src/solvers2/xp2known.c +++ b/src/solvers2/xp2known.c @@ -21,6 +21,55 @@ of or in connection with the use or performance of this software. #ifdef __cplusplus extern "C" { +#endif + + typedef struct Umultinfo Umultinfo; + +#ifdef ALLOW_OPENMP +#include +#define NO_PTHREADS +#endif +#ifdef NO_PTHREADS +#define PTHREADS(x) +#define pthread_mutex_lock(t) /*nothing*/ +#define pthread_mutex_unlock(t) /*nothing*/ +#ifndef ALLOW_OPENMP +#undef MULTIPLE_THREADS +#endif +#else +#define PTHREADS(x) x +#undef MULTIPLE_THREADS +#define MULTIPLE_THREADS +#include +#endif + +#ifdef MULTIPLE_THREADS + typedef struct { + ParHvInit *tp; + int tno; + EvalWorkspace *ew; + Umultinfo *u; + range *r; + } Thparshv2; + +struct ParHvInit { + ASL_pfgh *asl; + EvalWorkspace *ew; + Ihinfo *ihi; /* current */ + Thparshv2 *tpv1; + PTHREADS(pthread_mutex_t mutex;) + PTHREADS(pthread_t* T;) + int thpv1max; /* number of tpv1 structures */ + real *p; + real *ow; + real *y; + int nobj; + real *W; + range *r; /* current */ + real align; /* make sure (ParHvInit*)+1 is properly aligned */ + }; +extern void wk_init_ASL(real*, int*, real); +extern EvalWorkspace *ewalloc4_ASL(ASL_pfgh *, EvalWorkspace *); #endif extern void dv_comp_ASL(EvalWorkspace*, int, int); @@ -110,13 +159,11 @@ xp2known_ew_ASL(EvalWorkspace *ew, real *X, fint *nerror) return rc; } - typedef struct Umultinfo Umultinfo; struct Umultinfo { Umultinfo *next; int *ov, *ov1, *ove; real *oc, *oc1; - ograd *og, *og0; int i, v; }; @@ -195,6 +242,109 @@ bigUmult(EvalWorkspace *ew, real *h, range *r, int nobj, real *ow, Umultinfo *u, } } +#ifdef MULTIPLE_THREADS + static int +hvitodo(ParHvInit *tp, Thparshv2 *tp1) +{ + Ihinfo *ihi; + range *r; + + if (!(ihi = tp->ihi)) + return 0; + if (!(r = tp->r)) { + /* starting */ + if (!(r = ihi->r)) + return 0; + goto have_r; + } + if ((r = r->rlist.prev)) + goto have_r; + if (!(ihi = tp->ihi = ihi->next) || !(r = ihi->r)) + return 0; + tp->ihi = ihi; + have_r: + tp1->r = tp->r = r; + return 1; + } + + static void* +tstart4(void *arg) +{ + ASL_pfgh *asl; + EvalWorkspace *ew; + ParHvInit *tp; + Thparshv2 *tp1; + Umultinfo *u; + Varval *V, *v; + int i, j, n1, nobj, *ov, *ove, *ui, *uie; + linarg *la, **lap, **lap1, **lape; + range *r; + real *W, *h, *oc, *ow, *s, *si, *w, *y; + + tp1 = arg; + tp = tp1->tp; + ow = tp->ow; + y = tp->y; + nobj = tp->nobj; + asl = tp->asl; + W = asl->i.Ew0->w; + if (!(ew = tp1->ew)) + tp1->ew = ew = ewalloc4_ASL(asl, tp->ew); + else if (ew->nxval != asl->i.Ew0->nxval) + memcpy(ew->w, asl->i.Ew0->w, asl->i.wlen); + w = ew->w; + V = (Varval*)w; + s = w + asl->P.dOscratch; + top: + r = tp1->r; + for(ui = r->ui, uie = ui + r->nv; ui < uie; ++ui) { + v = V + *ui; + v->aO = v->dO = v->adO = 0.; + } + h = W + r->hest; + if ((n1 = r->n) < r->nv) { + si = s; + lape = lap = r->lap; + for(i = 0; i < n1; i++) { + *si = 1.; + la = *lape++; + oc = la->oc; + ov = la->ov; + for(ove = ov + la->nnz; ov < ove; ++ov) { + v = V + *ov; + v->dO = *oc++; + v->aO = v->adO = 0.; + } + pshv_prod_ASL(ew, r, nobj, ow, y); + for(ov = la->ov; ov < ove; ++ov) + V[*ov].dO = 0.; + *si++ = 0; + lap1 = lap; + do *h++ = V[(*lap1++)->u.v].aO; + while(lap1 < lape); + } + } + else { + if (!(u = tp1->u)) + tp1->u = u = (Umultinfo*)new_mblk(asl->P.krnmax); + bigUmult(ew, h, r, nobj, ow, u, y); + } + pthread_mutex_lock(&tp->mutex); +#ifdef ALLOW_OPENMP +#pragma omp critical + { +#endif + j = hvitodo(tp,tp1); + pthread_mutex_unlock(&tp->mutex); +#ifdef ALLOW_OPENMP + } +#endif + if (j) + goto top; + return 0; + } +#endif + void hvpinit_nc_ASL(EvalWorkspace *ew, int ndhmax, int nobj, real *ow, real *y) { @@ -206,6 +356,17 @@ hvpinit_nc_ASL(EvalWorkspace *ew, int ndhmax, int nobj, real *ow, real *y) linarg *la, **lap, **lap1, **lape; range *r; real *h, *oc, *s, *si, *w; +#ifdef MULTIPLE_THREADS + ParHvInit *phvi; + Thparshv2 *tp1, *tp1e; + int j, m, maxit = 1, nrng; + PTHREADS(pthread_t *T;) + real *W; +#ifndef ALLOW_OPENMP + int it; + void *rc; +#endif +#endif asl = (ASL_pfgh*)ew->asl; ew->nhvprod = 0; @@ -220,6 +381,97 @@ hvpinit_nc_ASL(EvalWorkspace *ew, int ndhmax, int nobj, real *ow, real *y) nobj = -1; w = ew->w; V = (Varval*)w; +#ifdef MULTIPLE_THREADS + if ((m = asl->P.hesvecth) <= 0 || !(asl->P.sph_opts & 256)) + m = 0; + else { + nrng = asl->P.nran; + if (m > nrng) + m = nrng; + if ((phvi = ew->parhvinit)) { + if (m <= phvi->thpv1max) + goto have_phvi; + free(phvi); + } + ew->parhvinit = phvi = (ParHvInit*)M1alloc(sizeof(ParHvInit) + + m*(sizeof(Thparshv2) + PTHREADS(+ sizeof(pthread_t*)))); + if (!ew->hvscratch) + ew->hvscratch = (real*)M1alloc(2*n_var*sizeof(real)); + phvi->asl = asl; + phvi->ew = ew; + phvi->nobj = nobj; + phvi->W = W = asl->i.Ew0->w; + phvi->ow = ow; + phvi->y = y; + PTHREADS(pthread_mutex_init(&phvi->mutex, 0);) + phvi->thpv1max = m; + tp1 = phvi->tpv1 = (Thparshv2*)(phvi + 1); + for(j = 0; j < m; j++) { + tp1->tp = phvi; + tp1->tno = j; + tp1->ew = 0; + tp1->u = 0; + tp1++; + } + PTHREADS(phvi->T = (pthread_t*)tp1;) + phvi->tpv1->ew = ew; + if (!asl->P.krnmax) + asl->P.krnmax = htcl(asl->P.rnmax*sizeof(Umultinfo) + + n_var*sizeof(int)); + have_phvi: + PTHREADS(T = phvi->T;) + phvi->ihi = asl->P.ihi1; + tp1 = phvi->tpv1; + phvi->r = 0; +#ifdef ALLOW_OPENMP +#pragma omp parallel num_threads(m) + { + int it, jt; + Thparshv2 *tpi; + + it = omp_get_thread_num(); + tpi = tp1 + it; +#pragma omp critical + { + if ((jt = hvitodo(phvi,tpi)) && maxit <= it) + maxit = it + 1; + } + if (jt) + tstart4(tpi); + } +#else + pthread_mutex_lock(&phvi->mutex); + for(it = 1; it < m; ++it) { + tpi = tp1 + it; + if (!hvitodo(phvi,tpi)) + break; + if ((j = pthread_create(&T[it], 0, &tstart4, tpi))) + fprintf(Stderr, "hvpinit_nc_ASL: return %d from pthread_create.\n", j); + } + maxit = it; + j = hvitodo(phvi,tp1); + pthread_mutex_unlock(&phvi->mutex); + if (j) + tstart4(tp1); + while(it > 1) { + if ((j = pthread_join(T[--it], &rc))) + fprintf(Stderr, + "hvpinit_nc_ASL: return %d from pthread_join for thread %d\n", + j, it); + } +#endif + asl->P.thusedhv = maxit; + tp1 = phvi->tpv1; + for(tp1e = tp1 + m; tp1 < tp1e; ++tp1) { + if ((u = tp1->u)) { + del_mblk(u); + tp1->u = 0; + } + } + goto done; + } +#endif s = w + asl->P.dOscratch; u = 0; for(ihc = 0; ihi->ihd <= ndhmax; ihi = ihi->next) { @@ -258,7 +510,7 @@ hvpinit_nc_ASL(EvalWorkspace *ew, int ndhmax, int nobj, real *ow, real *y) asl->P.krnmax = i = htcl(asl->P.rnmax*sizeof(Umultinfo) + n_var*sizeof(int)); - u = (Umultinfo*)new_mblk(i); + u = (Umultinfo*)new_mblk(asl->P.krnmax); } bigUmult(ew, h, r, nobj, ow, u, y); } diff --git a/src/solvers2/xsum0.out b/src/solvers2/xsum0.out index 53a005e..e2df939 100644 --- a/src/solvers2/xsum0.out +++ b/src/solvers2/xsum0.out @@ -5,10 +5,10 @@ arith.h0 e206209c 2309 arith.h1 12e5e42c 1871 arith.ibm f398be2b 1391 arithchk.c 63b0185 6532 -asl.h f6271193 48517 +asl.h ff744e06 48918 asl_pfg.h 6483336 1268 asl_pfgh.h 49b5a53 1288 -asldate.c 1e5277b0 29 +asldate.c 8721c18 29 atof.c 1fabc7d3 1747 auxinfo.c 1a3c8690 1488 avltree.c 10aeaa58 11346 @@ -21,23 +21,23 @@ configure 1679ad6 1706 configurehere 1f384d7 1690 conscale.c 9a0f9de 3482 degree.c 1e74ef10 11242 -derprop.c 920dff4 1297 +derprop.c e0fec8bb 1296 details.c0 fa7ec7b3 182 -dtoa.c fd6feb77 158408 -dtoa1.c e2667fa6 3209 +dtoa.c f84c88f9 159375 +dtoa1.c f65cb7ad 3287 duthes.c 7a25db7 3935 dynlink.c f185cc89 1375 errchk.h f88aec0 1695 eval1.c 17e9a3d0 40855 eval2.c a2f88f2 60670 ewalloc1.c 112e313e 3556 -ewalloc2.c 167d9a35 5662 +ewalloc2.c 10d91ffe 5672 f_read.c 12b32457 1227 fg_read.c 5c6ded0 52351 fg_write.c ec5c75a8 27358 float.h0 11d95cda 1822 fpecatch.c e1d8a914 1574 -fpinit.c e816f8aa 5665 +fpinit.c 66e5c26 5685 fpinitmt.c f58d6f11 4947 fpsetprec.s 1d0b2972 535 fpsetprec64.s 987339e 488 @@ -52,9 +52,9 @@ funcaddr.c fc16b89a 1197 g_fmt.c 642f235 3141 genrowno.c 1a8171dc 1805 getenv.c e86c81a2 1466 -getstub.c e7c9ed06 13191 +getstub.c e9cad561 13217 getstub.h f626e91d 7079 -htcl.c f7623114 2070 +htcl.c f208317a 2066 indic_cons.c 20213d 21941 jac0dim.c 56d3270 6907 jac2dim.h 81b97e7 1982 @@ -68,8 +68,8 @@ mainexit.c e4761b9b 1713 makefile.u 12ab9d16 10301 makefile.vc ff87c3b5 7239 mip_pri.c f1dd1d47 7887 -misc.c 89b49e2 31497 -mpec_adj.c 8e59ceb 9172 +misc.c 6ad8ad2 31479 +mpec_adj.c 4061b08 9785 mpec_adj0.c efb8be74 70 mqpcheckv.c eb808a2e 28794 mypow.c f6f74614 2464 @@ -79,7 +79,7 @@ nlp.h f4e528f6 4524 nlp2.h 75d2c79 4697 nqpcheck.c e992c2f0 26321 nqpcheckZ.c ea215f98 1188 -obj_adj.c 32ac378 18019 +obj_adj.c ecf3a590 18534 obj_adj.h eed794cf 2218 obj_adj0.c ffe042be 67 obj_prec.c fd065560 1440 @@ -91,10 +91,10 @@ opcode.hd 12bc8de3 1348 opno.hd f8e46748 2123 opno2.h f82baf62 4720 opnos.hd 1be96111 120 -pfghread.c bd6e8d1 131808 +pfghread.c f3b93e8b 131968 printf.c 147de217 24293 -pshvprod.c feeb6324 49835 -psinfo.h 5a305c1 12657 +pshvprod.c f6c5f139 61858 +psinfo.h e05960fc 13888 punknown.c fa46cf10 1704 qpcheck.c 1712f297 1588 qpcheckZ.c ea5a705c 1609 @@ -107,7 +107,7 @@ rnd_prod.s f4996bed 1471 sigcatch.c fe35ba8b 2374 sjac0dim.c ff686adb 1546 sos_add.c f59f41c0 22398 -sphes.c f0404c9e 34734 +sphes.c 354e74f 54173 sprintf.c e6ce1d45 3050 sscanf.c e8e79b09 3302 stderr.c fc99b4f4 2096 @@ -121,4 +121,4 @@ wrtsol_.c 15a9faa4 2431 ws_desc.c fceb8f65 214 wsu_desc.c fbb1c30c 170 xectim.c e56314e8 3543 -xp2known.c f1b51b3d 6844 +xp2known.c 3d83d8f 11944