Actual source code: gs.c
2: /***********************************gs.c***************************************
4: Author: Henry M. Tufo III
6: e-mail: hmt@cs.brown.edu
8: snail-mail:
9: Division of Applied Mathematics
10: Brown University
11: Providence, RI 02912
13: Last Modification:
14: 6.21.97
15: ************************************gs.c**************************************/
17: /***********************************gs.c***************************************
18: File Description:
19: -----------------
21: ************************************gs.c**************************************/
23: #include <../src/ksp/pc/impls/tfs/tfs.h>
25: /* default length of number of items via tree - doubles if exceeded */
26: #define TREE_BUF_SZ 2048;
27: #define GS_VEC_SZ 1
29: /***********************************gs.c***************************************
30: Type: struct gather_scatter_id
31: ------------------------------
33: ************************************gs.c**************************************/
34: typedef struct gather_scatter_id {
35: PetscInt id;
36: PetscInt nel_min;
37: PetscInt nel_max;
38: PetscInt nel_sum;
39: PetscInt negl;
40: PetscInt gl_max;
41: PetscInt gl_min;
42: PetscInt repeats;
43: PetscInt ordered;
44: PetscInt positive;
45: PetscScalar *vals;
47: /* bit mask info */
48: PetscInt *my_proc_mask;
49: PetscInt mask_sz;
50: PetscInt *ngh_buf;
51: PetscInt ngh_buf_sz;
52: PetscInt *nghs;
53: PetscInt num_nghs;
54: PetscInt max_nghs;
55: PetscInt *pw_nghs;
56: PetscInt num_pw_nghs;
57: PetscInt *tree_nghs;
58: PetscInt num_tree_nghs;
60: PetscInt num_loads;
62: /* repeats == true -> local info */
63: PetscInt nel; /* number of unique elememts */
64: PetscInt *elms; /* of size nel */
65: PetscInt nel_total;
66: PetscInt *local_elms; /* of size nel_total */
67: PetscInt *companion; /* of size nel_total */
69: /* local info */
70: PetscInt num_local_total;
71: PetscInt local_strength;
72: PetscInt num_local;
73: PetscInt *num_local_reduce;
74: PetscInt **local_reduce;
75: PetscInt num_local_gop;
76: PetscInt *num_gop_local_reduce;
77: PetscInt **gop_local_reduce;
79: /* pairwise info */
80: PetscInt level;
81: PetscInt num_pairs;
82: PetscInt max_pairs;
83: PetscInt loc_node_pairs;
84: PetscInt max_node_pairs;
85: PetscInt min_node_pairs;
86: PetscInt avg_node_pairs;
87: PetscInt *pair_list;
88: PetscInt *msg_sizes;
89: PetscInt **node_list;
90: PetscInt len_pw_list;
91: PetscInt *pw_elm_list;
92: PetscScalar *pw_vals;
94: MPI_Request *msg_ids_in;
95: MPI_Request *msg_ids_out;
97: PetscScalar *out;
98: PetscScalar *in;
99: PetscInt msg_total;
101: /* tree - crystal accumulator info */
102: PetscInt max_left_over;
103: PetscInt *pre;
104: PetscInt *in_num;
105: PetscInt *out_num;
106: PetscInt **in_list;
107: PetscInt **out_list;
109: /* new tree work*/
110: PetscInt tree_nel;
111: PetscInt *tree_elms;
112: PetscScalar *tree_buf;
113: PetscScalar *tree_work;
115: PetscInt tree_map_sz;
116: PetscInt *tree_map_in;
117: PetscInt *tree_map_out;
119: /* current memory status */
120: PetscInt gl_bss_min;
121: PetscInt gl_perm_min;
123: /* max segment size for PCTFS_gs_gop_vec() */
124: PetscInt vec_sz;
126: /* hack to make paul happy */
127: MPI_Comm PCTFS_gs_comm;
129: } PCTFS_gs_id;
131: static PCTFS_gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
132: static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
133: static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
134: static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
135: static PCTFS_gs_id *gsi_new(void);
136: static PetscErrorCode set_tree(PCTFS_gs_id *gs);
138: /* same for all but vector flavor */
139: static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
140: /* vector flavor */
141: static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
143: static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
144: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
145: static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
146: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
147: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
149: static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
150: static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);
152: static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
153: static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
154: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);
156: /* global vars */
157: /* from comm.c module */
159: static PetscInt num_gs_ids = 0;
161: /* should make this dynamic ... later */
162: static PetscInt msg_buf =MAX_MSG_BUF;
163: static PetscInt vec_sz =GS_VEC_SZ;
164: static PetscInt *tree_buf =NULL;
165: static PetscInt tree_buf_sz=0;
166: static PetscInt ntree =0;
168: /***************************************************************************/
169: PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size)
170: {
172: vec_sz = size;
173: return(0);
174: }
176: /******************************************************************************/
177: PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)
178: {
180: msg_buf = buf_size;
181: return(0);
182: }
184: /******************************************************************************/
185: PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level)
186: {
187: PCTFS_gs_id *gs;
188: MPI_Group PCTFS_gs_group;
189: MPI_Comm PCTFS_gs_comm;
192: /* ensure that communication package has been initialized */
193: PCTFS_comm_init();
195: /* determines if we have enough dynamic/semi-static memory */
196: /* checks input, allocs and sets gd_id template */
197: gs = gsi_check_args(elms,nel,level);
199: /* only bit mask version up and working for the moment */
200: /* LATER :: get int list version working for sparse pblms */
201: gsi_via_bit_mask(gs);CHKERRABORT(PETSC_COMM_WORLD,ierr);
203: MPI_Comm_group(MPI_COMM_WORLD,&PCTFS_gs_group);CHKERRABORT(PETSC_COMM_WORLD,ierr);
204: MPI_Comm_create(MPI_COMM_WORLD,PCTFS_gs_group,&PCTFS_gs_comm);CHKERRABORT(PETSC_COMM_WORLD,ierr);
205: MPI_Group_free(&PCTFS_gs_group);CHKERRABORT(PETSC_COMM_WORLD,ierr);
207: gs->PCTFS_gs_comm=PCTFS_gs_comm;
209: return(gs);
210: }
212: /******************************************************************************/
213: static PCTFS_gs_id *gsi_new(void)
214: {
216: PCTFS_gs_id *gs;
217: gs = (PCTFS_gs_id*) malloc(sizeof(PCTFS_gs_id));
218: PetscMemzero(gs,sizeof(PCTFS_gs_id));CHKERRABORT(PETSC_COMM_WORLD,ierr);
219: return(gs);
220: }
222: /******************************************************************************/
223: static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
224: {
225: PetscInt i, j, k, t2;
226: PetscInt *companion, *elms, *unique, *iptr;
227: PetscInt num_local=0, *num_to_reduce, **local_reduce;
228: PetscInt oprs[] = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_MIN,GL_B_AND};
229: PetscInt vals[sizeof(oprs)/sizeof(oprs[0])-1];
230: PetscInt work[sizeof(oprs)/sizeof(oprs[0])-1];
231: PCTFS_gs_id *gs;
234: if (!in_elms) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"elms point to nothing!!!\n");
235: if (nel<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"can't have fewer than 0 elms!!!\n");
237: if (nel==0) { PetscInfo(0,"I don't have any elements!!!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr); }
239: /* get space for gs template */
240: gs = gsi_new();
241: gs->id = ++num_gs_ids;
243: /* hmt 6.4.99 */
244: /* caller can set global ids that don't participate to 0 */
245: /* PCTFS_gs_init ignores all zeros in elm list */
246: /* negative global ids are still invalid */
247: for (i=j=0; i<nel; i++) {
248: if (in_elms[i]!=0) j++;
249: }
251: k=nel; nel=j;
253: /* copy over in_elms list and create inverse map */
254: elms = (PetscInt*) malloc((nel+1)*sizeof(PetscInt));
255: companion = (PetscInt*) malloc(nel*sizeof(PetscInt));
257: for (i=j=0; i<k; i++) {
258: if (in_elms[i]!=0) { elms[j] = in_elms[i]; companion[j++] = i; }
259: }
261: if (j!=nel) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"nel j mismatch!\n");
263: /* pre-pass ... check to see if sorted */
264: elms[nel] = INT_MAX;
265: iptr = elms;
266: unique = elms+1;
267: j =0;
268: while (*iptr!=INT_MAX) {
269: if (*iptr++>*unique++) { j=1; break; }
270: }
272: /* set up inverse map */
273: if (j) {
274: PetscInfo(0,"gsi_check_args() :: elm list *not* sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);
275: PCTFS_SMI_sort((void*)elms, (void*)companion, nel, SORT_INTEGER);CHKERRABORT(PETSC_COMM_WORLD,ierr);
276: } else { PetscInfo(0,"gsi_check_args() :: elm list sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr); }
277: elms[nel] = INT_MIN;
279: /* first pass */
280: /* determine number of unique elements, check pd */
281: for (i=k=0; i<nel; i+=j) {
282: t2 = elms[i];
283: j = ++i;
285: /* clump 'em for now */
286: while (elms[j]==t2) j++;
288: /* how many together and num local */
289: if (j-=i) { num_local++; k+=j; }
290: }
292: /* how many unique elements? */
293: gs->repeats = k;
294: gs->nel = nel-k;
296: /* number of repeats? */
297: gs->num_local = num_local;
298: num_local += 2;
299: gs->local_reduce = local_reduce=(PetscInt**)malloc(num_local*sizeof(PetscInt*));
300: gs->num_local_reduce = num_to_reduce=(PetscInt*) malloc(num_local*sizeof(PetscInt));
302: unique = (PetscInt*) malloc((gs->nel+1)*sizeof(PetscInt));
303: gs->elms = unique;
304: gs->nel_total = nel;
305: gs->local_elms = elms;
306: gs->companion = companion;
308: /* compess map as well as keep track of local ops */
309: for (num_local=i=j=0; i<gs->nel; i++) {
310: k = j;
311: t2 = unique[i] = elms[j];
312: companion[i] = companion[j];
314: while (elms[j]==t2) j++;
316: if ((t2=(j-k))>1) {
317: /* number together */
318: num_to_reduce[num_local] = t2++;
320: iptr = local_reduce[num_local++] = (PetscInt*)malloc(t2*sizeof(PetscInt));
322: /* to use binary searching don't remap until we check intersection */
323: *iptr++ = i;
325: /* note that we're skipping the first one */
326: while (++k<j) *(iptr++) = companion[k];
327: *iptr = -1;
328: }
329: }
331: /* sentinel for ngh_buf */
332: unique[gs->nel]=INT_MAX;
334: /* for two partition sort hack */
335: num_to_reduce[num_local] = 0;
336: local_reduce[num_local] = NULL;
337: num_to_reduce[++num_local] = 0;
338: local_reduce[num_local] = NULL;
340: /* load 'em up */
341: /* note one extra to hold NON_UNIFORM flag!!! */
342: vals[2] = vals[1] = vals[0] = nel;
343: if (gs->nel>0) {
344: vals[3] = unique[0];
345: vals[4] = unique[gs->nel-1];
346: } else {
347: vals[3] = INT_MAX;
348: vals[4] = INT_MIN;
349: }
350: vals[5] = level;
351: vals[6] = num_gs_ids;
353: /* GLOBAL: send 'em out */
354: PCTFS_giop(vals,work,sizeof(oprs)/sizeof(oprs[0])-1,oprs);CHKERRABORT(PETSC_COMM_WORLD,ierr);
356: /* must be semi-pos def - only pairwise depends on this */
357: /* LATER - remove this restriction */
358: if (vals[3]<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system not semi-pos def \n");
359: if (vals[4]==INT_MAX) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system ub too large !\n");
361: gs->nel_min = vals[0];
362: gs->nel_max = vals[1];
363: gs->nel_sum = vals[2];
364: gs->gl_min = vals[3];
365: gs->gl_max = vals[4];
366: gs->negl = vals[4]-vals[3]+1;
368: if (gs->negl<=0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system empty or neg :: %d\n");
370: /* LATER :: add level == -1 -> program selects level */
371: if (vals[5]<0) vals[5]=0;
372: else if (vals[5]>PCTFS_num_nodes) vals[5]=PCTFS_num_nodes;
373: gs->level = vals[5];
375: return(gs);
376: }
378: /******************************************************************************/
379: static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs)
380: {
381: PetscInt i, nel, *elms;
382: PetscInt t1;
383: PetscInt **reduce;
384: PetscInt *map;
388: /* totally local removes ... PCTFS_ct_bits == 0 */
389: get_ngh_buf(gs);
391: if (gs->level) set_pairwise(gs);
392: if (gs->max_left_over) set_tree(gs);
394: /* intersection local and pairwise/tree? */
395: gs->num_local_total = gs->num_local;
396: gs->gop_local_reduce = gs->local_reduce;
397: gs->num_gop_local_reduce = gs->num_local_reduce;
399: map = gs->companion;
401: /* is there any local compression */
402: if (!gs->num_local) {
403: gs->local_strength = NONE;
404: gs->num_local_gop = 0;
405: } else {
406: /* ok find intersection */
407: map = gs->companion;
408: reduce = gs->local_reduce;
409: for (i=0, t1=0; i<gs->num_local; i++, reduce++) {
410: if ((PCTFS_ivec_binary_search(**reduce,gs->pw_elm_list,gs->len_pw_list)>=0) || PCTFS_ivec_binary_search(**reduce,gs->tree_map_in,gs->tree_map_sz)>=0) {
411: t1++;
412: if (gs->num_local_reduce[i]<=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"nobody in list?");
413: gs->num_local_reduce[i] *= -1;
414: }
415: **reduce=map[**reduce];
416: }
418: /* intersection is empty */
419: if (!t1) {
420: gs->local_strength = FULL;
421: gs->num_local_gop = 0;
422: } else { /* intersection not empty */
423: gs->local_strength = PARTIAL;
425: PCTFS_SMI_sort((void*)gs->num_local_reduce, (void*)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR);
427: gs->num_local_gop = t1;
428: gs->num_local_total = gs->num_local;
429: gs->num_local -= t1;
430: gs->gop_local_reduce = gs->local_reduce;
431: gs->num_gop_local_reduce = gs->num_local_reduce;
433: for (i=0; i<t1; i++) {
434: if (gs->num_gop_local_reduce[i]>=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"they aren't negative?");
435: gs->num_gop_local_reduce[i] *= -1;
436: gs->local_reduce++;
437: gs->num_local_reduce++;
438: }
439: gs->local_reduce++;
440: gs->num_local_reduce++;
441: }
442: }
444: elms = gs->pw_elm_list;
445: nel = gs->len_pw_list;
446: for (i=0; i<nel; i++) elms[i] = map[elms[i]];
448: elms = gs->tree_map_in;
449: nel = gs->tree_map_sz;
450: for (i=0; i<nel; i++) elms[i] = map[elms[i]];
452: /* clean up */
453: free((void*) gs->local_elms);
454: free((void*) gs->companion);
455: free((void*) gs->elms);
456: free((void*) gs->ngh_buf);
457: gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
458: return(0);
459: }
461: /******************************************************************************/
462: static PetscErrorCode place_in_tree(PetscInt elm)
463: {
464: PetscInt *tp, n;
467: if (ntree==tree_buf_sz) {
468: if (tree_buf_sz) {
469: tp = tree_buf;
470: n = tree_buf_sz;
471: tree_buf_sz<<=1;
472: tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
473: PCTFS_ivec_copy(tree_buf,tp,n);
474: free(tp);
475: } else {
476: tree_buf_sz = TREE_BUF_SZ;
477: tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
478: }
479: }
481: tree_buf[ntree++] = elm;
482: return(0);
483: }
485: /******************************************************************************/
486: static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs)
487: {
488: PetscInt i, j, npw=0, ntree_map=0;
489: PetscInt p_mask_size, ngh_buf_size, buf_size;
490: PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
491: PetscInt *ngh_buf, *buf1, *buf2;
492: PetscInt offset, per_load, num_loads, or_ct, start, end;
493: PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms;
494: PetscInt oper=GL_B_OR;
495: PetscInt *ptr3, *t_mask, level, ct1, ct2;
499: /* to make life easier */
500: nel = gs->nel;
501: elms = gs->elms;
502: level = gs->level;
504: /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */
505: p_mask = (PetscInt*) malloc(p_mask_size=PCTFS_len_bit_mask(PCTFS_num_nodes));
506: PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);
508: /* allocate space for masks and info bufs */
509: gs->nghs = sh_proc_mask = (PetscInt*) malloc(p_mask_size);
510: gs->pw_nghs = pw_sh_proc_mask = (PetscInt*) malloc(p_mask_size);
511: gs->ngh_buf_sz = ngh_buf_size = p_mask_size*nel;
512: t_mask = (PetscInt*) malloc(p_mask_size);
513: gs->ngh_buf = ngh_buf = (PetscInt*) malloc(ngh_buf_size);
515: /* comm buffer size ... memory usage bounded by ~2*msg_buf */
516: /* had thought I could exploit rendezvous threshold */
518: /* default is one pass */
519: per_load = negl = gs->negl;
520: gs->num_loads = num_loads = 1;
521: i = p_mask_size*negl;
523: /* possible overflow on buffer size */
524: /* overflow hack */
525: if (i<0) i=INT_MAX;
527: buf_size = PetscMin(msg_buf,i);
529: /* can we do it? */
530: if (p_mask_size>buf_size) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"get_ngh_buf() :: buf<pms :: %d>%d\n",p_mask_size,buf_size);
532: /* get PCTFS_giop buf space ... make *only* one malloc */
533: buf1 = (PetscInt*) malloc(buf_size<<1);
535: /* more than one gior exchange needed? */
536: if (buf_size!=i) {
537: per_load = buf_size/p_mask_size;
538: buf_size = per_load*p_mask_size;
539: gs->num_loads = num_loads = negl/per_load + (negl%per_load>0);
540: }
542: /* convert buf sizes from #bytes to #ints - 32 bit only! */
543: p_mask_size/=sizeof(PetscInt); ngh_buf_size/=sizeof(PetscInt); buf_size/=sizeof(PetscInt);
545: /* find PCTFS_giop work space */
546: buf2 = buf1+buf_size;
548: /* hold #ints needed for processor masks */
549: gs->mask_sz=p_mask_size;
551: /* init buffers */
552: PCTFS_ivec_zero(sh_proc_mask,p_mask_size);
553: PCTFS_ivec_zero(pw_sh_proc_mask,p_mask_size);
554: PCTFS_ivec_zero(ngh_buf,ngh_buf_size);
556: /* HACK reset tree info */
557: tree_buf = NULL;
558: tree_buf_sz = ntree = 0;
560: /* ok do it */
561: for (ptr1=ngh_buf,ptr2=elms,end=gs->gl_min,or_ct=i=0; or_ct<num_loads; or_ct++) {
562: /* identity for bitwise or is 000...000 */
563: PCTFS_ivec_zero(buf1,buf_size);
565: /* load msg buffer */
566: for (start=end,end+=per_load,i_start=i; (offset=*ptr2)<end; i++, ptr2++) {
567: offset = (offset-start)*p_mask_size;
568: PCTFS_ivec_copy(buf1+offset,p_mask,p_mask_size);
569: }
571: /* GLOBAL: pass buffer */
572: PCTFS_giop(buf1,buf2,buf_size,&oper);
574: /* unload buffer into ngh_buf */
575: ptr2=(elms+i_start);
576: for (ptr3=buf1,j=start; j<end; ptr3+=p_mask_size,j++) {
577: /* I own it ... may have to pairwise it */
578: if (j==*ptr2) {
579: /* do i share it w/anyone? */
580: ct1 = PCTFS_ct_bits((char*)ptr3,p_mask_size*sizeof(PetscInt));
581: /* guess not */
582: if (ct1<2) { ptr2++; ptr1+=p_mask_size; continue; }
584: /* i do ... so keep info and turn off my bit */
585: PCTFS_ivec_copy(ptr1,ptr3,p_mask_size);
586: PCTFS_ivec_xor(ptr1,p_mask,p_mask_size);
587: PCTFS_ivec_or(sh_proc_mask,ptr1,p_mask_size);
589: /* is it to be done pairwise? */
590: if (--ct1<=level) {
591: npw++;
593: /* turn on high bit to indicate pw need to process */
594: *ptr2++ |= TOP_BIT;
595: PCTFS_ivec_or(pw_sh_proc_mask,ptr1,p_mask_size);
596: ptr1 += p_mask_size;
597: continue;
598: }
600: /* get set for next and note that I have a tree contribution */
601: /* could save exact elm index for tree here -> save a search */
602: ptr2++; ptr1+=p_mask_size; ntree_map++;
603: } else { /* i don't but still might be involved in tree */
605: /* shared by how many? */
606: ct1 = PCTFS_ct_bits((char*)ptr3,p_mask_size*sizeof(PetscInt));
608: /* none! */
609: if (ct1<2) continue;
611: /* is it going to be done pairwise? but not by me of course!*/
612: if (--ct1<=level) continue;
613: }
614: /* LATER we're going to have to process it NOW */
615: /* nope ... tree it */
616: place_in_tree(j);
617: }
618: }
620: free((void*)t_mask);
621: free((void*)buf1);
623: gs->len_pw_list = npw;
624: gs->num_nghs = PCTFS_ct_bits((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt));
626: /* expand from bit mask list to int list and save ngh list */
627: gs->nghs = (PetscInt*) malloc(gs->num_nghs * sizeof(PetscInt));
628: PCTFS_bm_to_proc((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt),gs->nghs);
630: gs->num_pw_nghs = PCTFS_ct_bits((char*)pw_sh_proc_mask,p_mask_size*sizeof(PetscInt));
632: oper = GL_MAX;
633: ct1 = gs->num_nghs;
634: PCTFS_giop(&ct1,&ct2,1,&oper);
635: gs->max_nghs = ct1;
637: gs->tree_map_sz = ntree_map;
638: gs->max_left_over=ntree;
640: free((void*)p_mask);
641: free((void*)sh_proc_mask);
642: return(0);
643: }
645: /******************************************************************************/
646: static PetscErrorCode set_pairwise(PCTFS_gs_id *gs)
647: {
648: PetscInt i, j;
649: PetscInt p_mask_size;
650: PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask;
651: PetscInt *ngh_buf, *buf2;
652: PetscInt offset;
653: PetscInt *msg_list, *msg_size, **msg_nodes, nprs;
654: PetscInt *pairwise_elm_list, len_pair_list=0;
655: PetscInt *iptr, t1, i_start, nel, *elms;
656: PetscInt ct;
660: /* to make life easier */
661: nel = gs->nel;
662: elms = gs->elms;
663: ngh_buf = gs->ngh_buf;
664: sh_proc_mask = gs->pw_nghs;
666: /* need a few temp masks */
667: p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes);
668: p_mask = (PetscInt*) malloc(p_mask_size);
669: tmp_proc_mask = (PetscInt*) malloc(p_mask_size);
671: /* set mask to my PCTFS_my_id's bit mask */
672: PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);
674: p_mask_size /= sizeof(PetscInt);
676: len_pair_list = gs->len_pw_list;
677: gs->pw_elm_list = pairwise_elm_list=(PetscInt*)malloc((len_pair_list+1)*sizeof(PetscInt));
679: /* how many processors (nghs) do we have to exchange with? */
680: nprs = gs->num_pairs = PCTFS_ct_bits((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt));
682: /* allocate space for PCTFS_gs_gop() info */
683: gs->pair_list = msg_list = (PetscInt*) malloc(sizeof(PetscInt)*nprs);
684: gs->msg_sizes = msg_size = (PetscInt*) malloc(sizeof(PetscInt)*nprs);
685: gs->node_list = msg_nodes = (PetscInt**) malloc(sizeof(PetscInt*)*(nprs+1));
687: /* init msg_size list */
688: PCTFS_ivec_zero(msg_size,nprs);
690: /* expand from bit mask list to int list */
691: PCTFS_bm_to_proc((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt),msg_list);
693: /* keep list of elements being handled pairwise */
694: for (i=j=0; i<nel; i++) {
695: if (elms[i] & TOP_BIT) { elms[i] ^= TOP_BIT; pairwise_elm_list[j++] = i; }
696: }
697: pairwise_elm_list[j] = -1;
699: gs->msg_ids_out = (MPI_Request*) malloc(sizeof(MPI_Request)*(nprs+1));
700: gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
701: gs->msg_ids_in = (MPI_Request*) malloc(sizeof(MPI_Request)*(nprs+1));
702: gs->msg_ids_in[nprs] = MPI_REQUEST_NULL;
703: gs->pw_vals = (PetscScalar*) malloc(sizeof(PetscScalar)*len_pair_list*vec_sz);
705: /* find who goes to each processor */
706: for (i_start=i=0; i<nprs; i++) {
707: /* processor i's mask */
708: PCTFS_set_bit_mask(p_mask,p_mask_size*sizeof(PetscInt),msg_list[i]);
710: /* det # going to processor i */
711: for (ct=j=0; j<len_pair_list; j++) {
712: buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
713: PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);
714: if (PCTFS_ct_bits((char*)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) ct++;
715: }
716: msg_size[i] = ct;
717: i_start = PetscMax(i_start,ct);
719: /*space to hold nodes in message to first neighbor */
720: msg_nodes[i] = iptr = (PetscInt*) malloc(sizeof(PetscInt)*(ct+1));
722: for (j=0;j<len_pair_list;j++) {
723: buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
724: PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);
725: if (PCTFS_ct_bits((char*)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) *iptr++ = j;
726: }
727: *iptr = -1;
728: }
729: msg_nodes[nprs] = NULL;
731: j = gs->loc_node_pairs=i_start;
732: t1 = GL_MAX;
733: PCTFS_giop(&i_start,&offset,1,&t1);
734: gs->max_node_pairs = i_start;
736: i_start = j;
737: t1 = GL_MIN;
738: PCTFS_giop(&i_start,&offset,1,&t1);
739: gs->min_node_pairs = i_start;
741: i_start = j;
742: t1 = GL_ADD;
743: PCTFS_giop(&i_start,&offset,1,&t1);
744: gs->avg_node_pairs = i_start/PCTFS_num_nodes + 1;
746: i_start = nprs;
747: t1 = GL_MAX;
748: PCTFS_giop(&i_start,&offset,1,&t1);
749: gs->max_pairs = i_start;
751: /* remap pairwise in tail of gsi_via_bit_mask() */
752: gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes,nprs);
753: gs->out = (PetscScalar*) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);
754: gs->in = (PetscScalar*) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);
756: /* reset malloc pool */
757: free((void*)p_mask);
758: free((void*)tmp_proc_mask);
759: return(0);
760: }
762: /* to do pruned tree just save ngh buf copy for each one and decode here!
763: ******************************************************************************/
764: static PetscErrorCode set_tree(PCTFS_gs_id *gs)
765: {
766: PetscInt i, j, n, nel;
767: PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;
770: /* local work ptrs */
771: elms = gs->elms;
772: nel = gs->nel;
774: /* how many via tree */
775: gs->tree_nel = n = ntree;
776: gs->tree_elms = tree_elms = iptr_in = tree_buf;
777: gs->tree_buf = (PetscScalar*) malloc(sizeof(PetscScalar)*n*vec_sz);
778: gs->tree_work = (PetscScalar*) malloc(sizeof(PetscScalar)*n*vec_sz);
779: j = gs->tree_map_sz;
780: gs->tree_map_in = iptr_in = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));
781: gs->tree_map_out = iptr_out = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));
783: /* search the longer of the two lists */
784: /* note ... could save this info in get_ngh_buf and save searches */
785: if (n<=nel) {
786: /* bijective fct w/remap - search elm list */
787: for (i=0; i<n; i++) {
788: if ((j=PCTFS_ivec_binary_search(*tree_elms++,elms,nel))>=0) {*iptr_in++ = j; *iptr_out++ = i;}
789: }
790: } else {
791: for (i=0; i<nel; i++) {
792: if ((j=PCTFS_ivec_binary_search(*elms++,tree_elms,n))>=0) {*iptr_in++ = i; *iptr_out++ = j;}
793: }
794: }
796: /* sentinel */
797: *iptr_in = *iptr_out = -1;
798: return(0);
799: }
801: /******************************************************************************/
802: static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals)
803: {
804: PetscInt *num, *map, **reduce;
805: PetscScalar tmp;
808: num = gs->num_gop_local_reduce;
809: reduce = gs->gop_local_reduce;
810: while ((map = *reduce++)) {
811: /* wall */
812: if (*num == 2) {
813: num++;
814: vals[map[1]] = vals[map[0]];
815: } else if (*num == 3) { /* corner shared by three elements */
816: num++;
817: vals[map[2]] = vals[map[1]] = vals[map[0]];
818: } else if (*num == 4) { /* corner shared by four elements */
819: num++;
820: vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
821: } else { /* general case ... odd geoms ... 3D*/
822: num++;
823: tmp = *(vals + *map++);
824: while (*map >= 0) *(vals + *map++) = tmp;
825: }
826: }
827: return(0);
828: }
830: /******************************************************************************/
831: static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals)
832: {
833: PetscInt *num, *map, **reduce;
834: PetscScalar tmp;
837: num = gs->num_local_reduce;
838: reduce = gs->local_reduce;
839: while ((map = *reduce)) {
840: /* wall */
841: if (*num == 2) {
842: num++; reduce++;
843: vals[map[1]] = vals[map[0]] += vals[map[1]];
844: } else if (*num == 3) { /* corner shared by three elements */
845: num++; reduce++;
846: vals[map[2]]=vals[map[1]]=vals[map[0]]+=(vals[map[1]]+vals[map[2]]);
847: } else if (*num == 4) { /* corner shared by four elements */
848: num++; reduce++;
849: vals[map[1]]=vals[map[2]]=vals[map[3]]=vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
850: } else { /* general case ... odd geoms ... 3D*/
851: num++;
852: tmp = 0.0;
853: while (*map >= 0) tmp += *(vals + *map++);
855: map = *reduce++;
856: while (*map >= 0) *(vals + *map++) = tmp;
857: }
858: }
859: return(0);
860: }
862: /******************************************************************************/
863: static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals)
864: {
865: PetscInt *num, *map, **reduce;
866: PetscScalar *base;
869: num = gs->num_gop_local_reduce;
870: reduce = gs->gop_local_reduce;
871: while ((map = *reduce++)) {
872: /* wall */
873: if (*num == 2) {
874: num++;
875: vals[map[0]] += vals[map[1]];
876: } else if (*num == 3) { /* corner shared by three elements */
877: num++;
878: vals[map[0]] += (vals[map[1]] + vals[map[2]]);
879: } else if (*num == 4) { /* corner shared by four elements */
880: num++;
881: vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
882: } else { /* general case ... odd geoms ... 3D*/
883: num++;
884: base = vals + *map++;
885: while (*map >= 0) *base += *(vals + *map++);
886: }
887: }
888: return(0);
889: }
891: /******************************************************************************/
892: PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs)
893: {
894: PetscInt i;
898: MPI_Comm_free(&gs->PCTFS_gs_comm);
899: if (gs->nghs) free((void*) gs->nghs);
900: if (gs->pw_nghs) free((void*) gs->pw_nghs);
902: /* tree */
903: if (gs->max_left_over) {
904: if (gs->tree_elms) free((void*) gs->tree_elms);
905: if (gs->tree_buf) free((void*) gs->tree_buf);
906: if (gs->tree_work) free((void*) gs->tree_work);
907: if (gs->tree_map_in) free((void*) gs->tree_map_in);
908: if (gs->tree_map_out) free((void*) gs->tree_map_out);
909: }
911: /* pairwise info */
912: if (gs->num_pairs) {
913: /* should be NULL already */
914: if (gs->ngh_buf) free((void*) gs->ngh_buf);
915: if (gs->elms) free((void*) gs->elms);
916: if (gs->local_elms) free((void*) gs->local_elms);
917: if (gs->companion) free((void*) gs->companion);
919: /* only set if pairwise */
920: if (gs->vals) free((void*) gs->vals);
921: if (gs->in) free((void*) gs->in);
922: if (gs->out) free((void*) gs->out);
923: if (gs->msg_ids_in) free((void*) gs->msg_ids_in);
924: if (gs->msg_ids_out) free((void*) gs->msg_ids_out);
925: if (gs->pw_vals) free((void*) gs->pw_vals);
926: if (gs->pw_elm_list) free((void*) gs->pw_elm_list);
927: if (gs->node_list) {
928: for (i=0;i<gs->num_pairs;i++) {
929: if (gs->node_list[i]) {
930: free((void*) gs->node_list[i]);
931: }
932: }
933: free((void*) gs->node_list);
934: }
935: if (gs->msg_sizes) free((void*) gs->msg_sizes);
936: if (gs->pair_list) free((void*) gs->pair_list);
937: }
939: /* local info */
940: if (gs->num_local_total>=0) {
941: for (i=0;i<gs->num_local_total+1;i++) {
942: if (gs->num_gop_local_reduce[i]) free((void*) gs->gop_local_reduce[i]);
943: }
944: }
946: /* if intersection tree/pairwise and local isn't empty */
947: if (gs->gop_local_reduce) free((void*) gs->gop_local_reduce);
948: if (gs->num_gop_local_reduce) free((void*) gs->num_gop_local_reduce);
950: free((void*) gs);
951: return(0);
952: }
954: /******************************************************************************/
955: PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt step)
956: {
960: switch (*op) {
961: case '+':
962: PCTFS_gs_gop_vec_plus(gs,vals,step);
963: break;
964: default:
965: PetscInfo1(0,"PCTFS_gs_gop_vec() :: %c is not a valid op\n",op[0]);
966: PetscInfo(0,"PCTFS_gs_gop_vec() :: default :: plus\n");
967: PCTFS_gs_gop_vec_plus(gs,vals,step);
968: break;
969: }
970: return(0);
971: }
973: /******************************************************************************/
974: static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
975: {
977: if (!gs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"PCTFS_gs_gop_vec() passed NULL gs handle!!!");
979: /* local only operations!!! */
980: if (gs->num_local) PCTFS_gs_gop_vec_local_plus(gs,vals,step);
982: /* if intersection tree/pairwise and local isn't empty */
983: if (gs->num_local_gop) {
984: PCTFS_gs_gop_vec_local_in_plus(gs,vals,step);
986: /* pairwise */
987: if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step);
989: /* tree */
990: else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,vals,step);
992: PCTFS_gs_gop_vec_local_out(gs,vals,step);
993: } else { /* if intersection tree/pairwise and local is empty */
994: /* pairwise */
995: if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step);
997: /* tree */
998: else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,vals,step);
999: }
1000: return(0);
1001: }
1003: /******************************************************************************/
1004: static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1005: {
1006: PetscInt *num, *map, **reduce;
1007: PetscScalar *base;
1010: num = gs->num_local_reduce;
1011: reduce = gs->local_reduce;
1012: while ((map = *reduce)) {
1013: base = vals + map[0] * step;
1015: /* wall */
1016: if (*num == 2) {
1017: num++; reduce++;
1018: PCTFS_rvec_add (base,vals+map[1]*step,step);
1019: PCTFS_rvec_copy(vals+map[1]*step,base,step);
1020: } else if (*num == 3) { /* corner shared by three elements */
1021: num++; reduce++;
1022: PCTFS_rvec_add (base,vals+map[1]*step,step);
1023: PCTFS_rvec_add (base,vals+map[2]*step,step);
1024: PCTFS_rvec_copy(vals+map[2]*step,base,step);
1025: PCTFS_rvec_copy(vals+map[1]*step,base,step);
1026: } else if (*num == 4) { /* corner shared by four elements */
1027: num++; reduce++;
1028: PCTFS_rvec_add (base,vals+map[1]*step,step);
1029: PCTFS_rvec_add (base,vals+map[2]*step,step);
1030: PCTFS_rvec_add (base,vals+map[3]*step,step);
1031: PCTFS_rvec_copy(vals+map[3]*step,base,step);
1032: PCTFS_rvec_copy(vals+map[2]*step,base,step);
1033: PCTFS_rvec_copy(vals+map[1]*step,base,step);
1034: } else { /* general case ... odd geoms ... 3D */
1035: num++;
1036: while (*++map >= 0) PCTFS_rvec_add (base,vals+*map*step,step);
1038: map = *reduce;
1039: while (*++map >= 0) PCTFS_rvec_copy(vals+*map*step,base,step);
1041: reduce++;
1042: }
1043: }
1044: return(0);
1045: }
1047: /******************************************************************************/
1048: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1049: {
1050: PetscInt *num, *map, **reduce;
1051: PetscScalar *base;
1054: num = gs->num_gop_local_reduce;
1055: reduce = gs->gop_local_reduce;
1056: while ((map = *reduce++)) {
1057: base = vals + map[0] * step;
1059: /* wall */
1060: if (*num == 2) {
1061: num++;
1062: PCTFS_rvec_add(base,vals+map[1]*step,step);
1063: } else if (*num == 3) { /* corner shared by three elements */
1064: num++;
1065: PCTFS_rvec_add(base,vals+map[1]*step,step);
1066: PCTFS_rvec_add(base,vals+map[2]*step,step);
1067: } else if (*num == 4) { /* corner shared by four elements */
1068: num++;
1069: PCTFS_rvec_add(base,vals+map[1]*step,step);
1070: PCTFS_rvec_add(base,vals+map[2]*step,step);
1071: PCTFS_rvec_add(base,vals+map[3]*step,step);
1072: } else { /* general case ... odd geoms ... 3D*/
1073: num++;
1074: while (*++map >= 0) PCTFS_rvec_add(base,vals+*map*step,step);
1075: }
1076: }
1077: return(0);
1078: }
1080: /******************************************************************************/
1081: static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1082: {
1083: PetscInt *num, *map, **reduce;
1084: PetscScalar *base;
1087: num = gs->num_gop_local_reduce;
1088: reduce = gs->gop_local_reduce;
1089: while ((map = *reduce++)) {
1090: base = vals + map[0] * step;
1092: /* wall */
1093: if (*num == 2) {
1094: num++;
1095: PCTFS_rvec_copy(vals+map[1]*step,base,step);
1096: } else if (*num == 3) { /* corner shared by three elements */
1097: num++;
1098: PCTFS_rvec_copy(vals+map[1]*step,base,step);
1099: PCTFS_rvec_copy(vals+map[2]*step,base,step);
1100: } else if (*num == 4) { /* corner shared by four elements */
1101: num++;
1102: PCTFS_rvec_copy(vals+map[1]*step,base,step);
1103: PCTFS_rvec_copy(vals+map[2]*step,base,step);
1104: PCTFS_rvec_copy(vals+map[3]*step,base,step);
1105: } else { /* general case ... odd geoms ... 3D*/
1106: num++;
1107: while (*++map >= 0) PCTFS_rvec_copy(vals+*map*step,base,step);
1108: }
1109: }
1110: return(0);
1111: }
1113: /******************************************************************************/
1114: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step)
1115: {
1116: PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1117: PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1118: PetscInt *pw, *list, *size, **nodes;
1119: MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1120: MPI_Status status;
1121: PetscBLASInt i1 = 1,dstep;
1125: /* strip and load s */
1126: msg_list = list = gs->pair_list;
1127: msg_size = size = gs->msg_sizes;
1128: msg_nodes = nodes = gs->node_list;
1129: iptr = pw = gs->pw_elm_list;
1130: dptr1 = dptr3 = gs->pw_vals;
1131: msg_ids_in = ids_in = gs->msg_ids_in;
1132: msg_ids_out = ids_out = gs->msg_ids_out;
1133: dptr2 = gs->out;
1134: in1=in2 = gs->in;
1136: /* post the receives */
1137: /* msg_nodes=nodes; */
1138: do {
1139: /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1140: second one *list and do list++ afterwards */
1141: MPI_Irecv(in1, *size *step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);
1142: list++;msg_ids_in++;
1143: in1 += *size++ *step;
1144: } while (*++msg_nodes);
1145: msg_nodes=nodes;
1147: /* load gs values into in out gs buffers */
1148: while (*iptr >= 0) {
1149: PCTFS_rvec_copy(dptr3,in_vals + *iptr*step,step);
1150: dptr3+=step;
1151: iptr++;
1152: }
1154: /* load out buffers and post the sends */
1155: while ((iptr = *msg_nodes++)) {
1156: dptr3 = dptr2;
1157: while (*iptr >= 0) {
1158: PCTFS_rvec_copy(dptr2,dptr1 + *iptr*step,step);
1159: dptr2+=step;
1160: iptr++;
1161: }
1162: MPI_Isend(dptr3, *msg_size *step, MPIU_SCALAR, *msg_list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);
1163: msg_size++; msg_list++;msg_ids_out++;
1164: }
1166: /* tree */
1167: if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,in_vals,step);
1169: /* process the received data */
1170: msg_nodes=nodes;
1171: while ((iptr = *nodes++)) {
1172: PetscScalar d1 = 1.0;
1174: /* Should I check the return value of MPI_Wait() or status? */
1175: /* Can this loop be replaced by a call to MPI_Waitall()? */
1176: MPI_Wait(ids_in, &status);
1177: ids_in++;
1178: while (*iptr >= 0) {
1179: PetscBLASIntCast(step,&dstep);
1180: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&dstep,&d1,in2,&i1,dptr1 + *iptr*step,&i1));
1181: in2+=step;
1182: iptr++;
1183: }
1184: }
1186: /* replace vals */
1187: while (*pw >= 0) {
1188: PCTFS_rvec_copy(in_vals + *pw*step,dptr1,step);
1189: dptr1+=step;
1190: pw++;
1191: }
1193: /* clear isend message handles */
1194: /* This changed for clarity though it could be the same */
1196: /* Should I check the return value of MPI_Wait() or status? */
1197: /* Can this loop be replaced by a call to MPI_Waitall()? */
1198: while (*msg_nodes++) {
1199: MPI_Wait(ids_out, &status);
1200: ids_out++;
1201: }
1202: return(0);
1203: }
1205: /******************************************************************************/
1206: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1207: {
1208: PetscInt size, *in, *out;
1209: PetscScalar *buf, *work;
1210: PetscInt op[] = {GL_ADD,0};
1211: PetscBLASInt i1 = 1;
1213: PetscBLASInt dstep;
1216: /* copy over to local variables */
1217: in = gs->tree_map_in;
1218: out = gs->tree_map_out;
1219: buf = gs->tree_buf;
1220: work = gs->tree_work;
1221: size = gs->tree_nel*step;
1223: /* zero out collection buffer */
1224: PCTFS_rvec_zero(buf,size);
1226: /* copy over my contributions */
1227: while (*in >= 0) {
1228: PetscBLASIntCast(step,&dstep);
1229: PetscStackCallBLAS("BLAScopy",BLAScopy_(&dstep,vals + *in++ * step,&i1,buf + *out++ * step,&i1));
1230: }
1232: /* perform fan in/out on full buffer */
1233: /* must change PCTFS_grop to handle the blas */
1234: PCTFS_grop(buf,work,size,op);
1236: /* reset */
1237: in = gs->tree_map_in;
1238: out = gs->tree_map_out;
1240: /* get the portion of the results I need */
1241: while (*in >= 0) {
1242: PetscBLASIntCast(step,&dstep);
1243: PetscStackCallBLAS("BLAScopy",BLAScopy_(&dstep,buf + *out++ * step,&i1,vals + *in++ * step,&i1));
1244: }
1245: return(0);
1246: }
1248: /******************************************************************************/
1249: PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt dim)
1250: {
1254: switch (*op) {
1255: case '+':
1256: PCTFS_gs_gop_plus_hc(gs,vals,dim);
1257: break;
1258: default:
1259: PetscInfo1(0,"PCTFS_gs_gop_hc() :: %c is not a valid op\n",op[0]);
1260: PetscInfo(0,"PCTFS_gs_gop_hc() :: default :: plus\n");
1261: PCTFS_gs_gop_plus_hc(gs,vals,dim);
1262: break;
1263: }
1264: return(0);
1265: }
1267: /******************************************************************************/
1268: static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1269: {
1271: /* if there's nothing to do return */
1272: if (dim<=0) return(0);
1274: /* can't do more dimensions then exist */
1275: dim = PetscMin(dim,PCTFS_i_log2_num_nodes);
1277: /* local only operations!!! */
1278: if (gs->num_local) PCTFS_gs_gop_local_plus(gs,vals);
1280: /* if intersection tree/pairwise and local isn't empty */
1281: if (gs->num_local_gop) {
1282: PCTFS_gs_gop_local_in_plus(gs,vals);
1284: /* pairwise will do tree inside ... */
1285: if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); /* tree only */
1286: else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,vals,dim);
1288: PCTFS_gs_gop_local_out(gs,vals);
1289: } else { /* if intersection tree/pairwise and local is empty */
1290: /* pairwise will do tree inside */
1291: if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); /* tree */
1292: else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,vals,dim);
1293: }
1294: return(0);
1295: }
1297: /******************************************************************************/
1298: static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim)
1299: {
1300: PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1301: PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1302: PetscInt *pw, *list, *size, **nodes;
1303: MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1304: MPI_Status status;
1305: PetscInt i, mask=1;
1309: for (i=1; i<dim; i++) { mask<<=1; mask++; }
1311: /* strip and load s */
1312: msg_list = list = gs->pair_list;
1313: msg_size = size = gs->msg_sizes;
1314: msg_nodes = nodes = gs->node_list;
1315: iptr = pw = gs->pw_elm_list;
1316: dptr1 = dptr3 = gs->pw_vals;
1317: msg_ids_in = ids_in = gs->msg_ids_in;
1318: msg_ids_out = ids_out = gs->msg_ids_out;
1319: dptr2 = gs->out;
1320: in1 = in2 = gs->in;
1322: /* post the receives */
1323: /* msg_nodes=nodes; */
1324: do {
1325: /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1326: second one *list and do list++ afterwards */
1327: if ((PCTFS_my_id|mask)==(*list|mask)) {
1328: MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);
1329: list++; msg_ids_in++;in1 += *size++;
1330: } else { list++; size++; }
1331: } while (*++msg_nodes);
1333: /* load gs values into in out gs buffers */
1334: while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++);
1336: /* load out buffers and post the sends */
1337: msg_nodes=nodes;
1338: list = msg_list;
1339: while ((iptr = *msg_nodes++)) {
1340: if ((PCTFS_my_id|mask)==(*list|mask)) {
1341: dptr3 = dptr2;
1342: while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++);
1343: /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
1344: /* is msg_ids_out++ correct? */
1345: MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);
1346: msg_size++;list++;msg_ids_out++;
1347: } else {list++; msg_size++;}
1348: }
1350: /* do the tree while we're waiting */
1351: if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,in_vals,dim);
1353: /* process the received data */
1354: msg_nodes=nodes;
1355: list = msg_list;
1356: while ((iptr = *nodes++)) {
1357: if ((PCTFS_my_id|mask)==(*list|mask)) {
1358: /* Should I check the return value of MPI_Wait() or status? */
1359: /* Can this loop be replaced by a call to MPI_Waitall()? */
1360: MPI_Wait(ids_in, &status);
1361: ids_in++;
1362: while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++;
1363: }
1364: list++;
1365: }
1367: /* replace vals */
1368: while (*pw >= 0) *(in_vals + *pw++) = *dptr1++;
1370: /* clear isend message handles */
1371: /* This changed for clarity though it could be the same */
1372: while (*msg_nodes++) {
1373: if ((PCTFS_my_id|mask)==(*msg_list|mask)) {
1374: /* Should I check the return value of MPI_Wait() or status? */
1375: /* Can this loop be replaced by a call to MPI_Waitall()? */
1376: MPI_Wait(ids_out, &status);
1377: ids_out++;
1378: }
1379: msg_list++;
1380: }
1381: return(0);
1382: }
1384: /******************************************************************************/
1385: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1386: {
1387: PetscInt size;
1388: PetscInt *in, *out;
1389: PetscScalar *buf, *work;
1390: PetscInt op[] = {GL_ADD,0};
1393: in = gs->tree_map_in;
1394: out = gs->tree_map_out;
1395: buf = gs->tree_buf;
1396: work = gs->tree_work;
1397: size = gs->tree_nel;
1399: PCTFS_rvec_zero(buf,size);
1401: while (*in >= 0) *(buf + *out++) = *(vals + *in++);
1403: in = gs->tree_map_in;
1404: out = gs->tree_map_out;
1406: PCTFS_grop_hc(buf,work,size,op,dim);
1408: while (*in >= 0) *(vals + *in++) = *(buf + *out++);
1409: return(0);
1410: }