Actual source code: xmllogevent.c
1: /*************************************************************************************
2: * M A R I T I M E R E S E A R C H I N S T I T U T E N E T H E R L A N D S *
3: *************************************************************************************
4: * authors: Bas van 't Hof, Koos Huijssen, Christiaan M. Klaij *
5: *************************************************************************************
6: * content: Support for nested PetscTimers *
7: *************************************************************************************/
8: #include <petsclog.h>
9: #include <petsc/private/logimpl.h>
10: #include <petsctime.h>
11: #include <petscviewer.h>
12: #include "../src/sys/logging/xmlviewer.h"
14: #if defined(PETSC_USE_LOG)
16: /*
17: * Support for nested PetscTimers
18: *
19: * PetscTimers keep track of a lot of useful information: Wall clock times,
20: * message passing statistics, flop counts. Information about the nested structure
21: * of the timers is lost. Example:
22: *
23: * 7:30 Start: awake
24: * 7:30 Start: morning routine
25: * 7:40 Start: eat
26: * 7:49 Done: eat
27: * 7:43 Done: morning routine
28: * 8:15 Start: work
29: * 12:15 Start: eat
30: * 12:45 Done: eat
31: * 16:00 Done: work
32: * 16:30 Start: evening routine
33: * 18:30 Start: eat
34: * 19:15 Done: eat
35: * 22:00 Done: evening routine
36: * 22:00 Done: awake
37: *
38: * Petsc timers provide the following timer results:
39: *
40: * awake: 1 call 14:30 hours
41: * morning routine: 1 call 0:13 hours
42: * eat: 3 calls 1:24 hours
43: * work: 1 call 7:45 hours
44: * evening routine 1 call 5:30 hours
45: *
46: * Nested timers can be used to get the following table:
47: *
48: * [1 call]: awake 14:30 hours
49: * [1 call]: morning routine 0:13 hours ( 2 % of awake)
50: * [1 call]: eat 0:09 hours (69 % of morning routine)
51: * rest (morning routine) 0:04 hours (31 % of morning routine)
52: * [1 call]: work 7:45 hours (53 % of awake)
53: * [1 call]: eat 0:30 hours ( 6 % of work)
54: * rest (work) 7:15 hours (94 % of work)
55: * [1 call]: evening routine 5:30 hours (38 % of awake)
56: * [1 call]: eat 0:45 hours (14 % of evening routine)
57: * rest (evening routine) 4:45 hours (86 % of morning routine)
58: *
59: * We ignore the concept of 'stages', because these seem to be conflicting notions, or at least,
60: * the nested timers make the stages unnecessary.
61: *
62: */
64: /*
65: * Data structures for keeping track of nested timers:
66: *
67: * nestedEvents: information about the timers that have actually been activated
68: * dftParentActive: if a timer is started now, it is part of (nested inside) the dftParentActive
69: *
70: * The Default-timers are used to time the nested timers. Every nested timer corresponds to
71: * (one or more) default timers, where one of the default timers has the same event-id as the
72: * nested one.
73: *
74: * Because of the risk of confusion between nested timer ids and default timer ids, we
75: * introduce a typedef for nested events (NestedEventId) and use the existing type PetscLogEvent
76: * only for default events. Also, all nested event variables are prepended with 'nst', and
77: * default timers with 'dft'.
78: */
80: #define DFT_ID_AWAKE -1
82: typedef PetscLogEvent NestedEventId;
83: typedef struct {
84: NestedEventId nstEvent; /* event-code for this nested event, argument 'event' in PetscLogEventStartNested */
85: int nParents; /* number of 'dftParents': the default timer which was the dftParentActive when this nested timer was activated */
86: PetscLogEvent *dftParentsSorted; /* The default timers which were the dftParentActive when this nested event was started */
87: PetscLogEvent *dftEvents; /* The default timers which represent the different 'instances' of this nested event */
89: PetscLogEvent *dftParents; /* The default timers which were the dftParentActive when this nested event was started */
90: PetscLogEvent *dftEventsSorted; /* The default timers which represent the different 'instances' of this nested event */
91: } PetscNestedEvent;
93: static PetscLogEvent dftParentActive = DFT_ID_AWAKE;
94: static int nNestedEvents = 0;
95: static int nNestedEventsAllocated = 0;
96: static PetscNestedEvent *nestedEvents = NULL;
97: static PetscLogDouble thresholdTime = 0.01; /* initial value was 0.1 */
99: #define THRESHOLD (thresholdTime/100.0+1e-12)
101: static PetscErrorCode PetscLogEventBeginNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4);
102: static PetscErrorCode PetscLogEventEndNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4);
103: PETSC_INTERN PetscErrorCode PetscLogView_Nested(PetscViewer);
104: PETSC_INTERN PetscErrorCode PetscLogView_Flamegraph(PetscViewer);
106: /*@C
107: PetscLogNestedBegin - Turns on nested logging of objects and events. This logs flop
108: rates and object creation and should not slow programs down too much.
110: Logically Collective over PETSC_COMM_WORLD
112: Options Database Keys:
113: . -log_view :filename.xml:ascii_xml - Prints an XML summary of flop and timing information to the file
115: Usage:
116: .vb
117: PetscInitialize(...);
118: PetscLogNestedBegin();
119: ... code ...
120: PetscLogView(viewer);
121: PetscFinalize();
122: .ve
124: Level: advanced
126: .seealso: PetscLogDump(), PetscLogAllBegin(), PetscLogView(), PetscLogTraceBegin(), PetscLogDefaultBegin()
127: @*/
128: PetscErrorCode PetscLogNestedBegin(void)
129: {
133: if (nestedEvents) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"nestedEvents already allocated");
135: nNestedEventsAllocated = 10;
136: PetscMalloc1(nNestedEventsAllocated,&nestedEvents);
137: dftParentActive = DFT_ID_AWAKE;
138: nNestedEvents =1;
140: /* 'Awake' is nested event 0. It has no parents */
141: nestedEvents[0].nstEvent = 0;
142: nestedEvents[0].nParents = 0;
143: nestedEvents[0].dftParentsSorted = NULL;
144: nestedEvents[0].dftEvents = NULL;
145: nestedEvents[0].dftParents = NULL;
146: nestedEvents[0].dftEventsSorted = NULL;
148: PetscLogSet(PetscLogEventBeginNested,PetscLogEventEndNested);
149: return(0);
150: }
152: /* Delete the data structures for the nested timers */
153: PetscErrorCode PetscLogNestedEnd(void)
154: {
156: int i;
159: if (!nestedEvents) return(0);
160: for (i=0; i<nNestedEvents; i++) {
161: PetscFree4(nestedEvents[i].dftParentsSorted,nestedEvents[i].dftEventsSorted,nestedEvents[i].dftParents,nestedEvents[i].dftEvents);
162: }
163: PetscFree(nestedEvents);
164: nestedEvents = NULL;
165: nNestedEvents = 0;
166: nNestedEventsAllocated = 0;
167: return(0);
168: }
170: /*
171: UTILITIES: FIND STUFF IN SORTED ARRAYS
173: dftIndex - index to be found
174: dftArray - sorted array of PetscLogEvent-ids
175: narray - dimension of dftArray
176: entry - entry in the array where dftIndex may be found;
178: if dftArray[entry] != dftIndex, then dftIndex is not part of dftArray
179: In that case, the dftIndex can be inserted at this entry.
180: */
181: static PetscErrorCode PetscLogEventFindDefaultTimer(PetscLogEvent dftIndex,const PetscLogEvent *dftArray,int narray,int *entry)
182: {
184: if (narray==0 || dftIndex <= dftArray[0]) {
185: *entry = 0;
186: } else if (dftIndex > dftArray[narray-1]) {
187: *entry = narray;
188: } else {
189: int ihigh = narray-1, ilow=0;
190: while (ihigh>ilow) {
191: const int imiddle = (ihigh+ilow)/2;
192: if (dftArray[imiddle] > dftIndex) {
193: ihigh = imiddle;
194: } else if (dftArray[imiddle]<dftIndex) {
195: ilow = imiddle+1;
196: } else {
197: ihigh = imiddle;
198: ilow = imiddle;
199: }
200: }
201: *entry = ihigh;
202: }
203: return(0);
204: }
206: /*
207: Utility: find the nested event with given identification
209: nstEvent - Nested event to be found
210: entry - entry in the nestedEvents where nstEvent may be found;
212: if nestedEvents[entry].nstEvent != nstEvent, then index is not part of iarray
213: */
214: static PetscErrorCode PetscLogEventFindNestedTimer(NestedEventId nstEvent,int *entry)
215: {
217: if (nNestedEvents==0 || nstEvent <= nestedEvents[0].nstEvent) {
218: *entry = 0;
219: } else if (nstEvent > nestedEvents[nNestedEvents-1].nstEvent) {
220: *entry = nNestedEvents;
221: } else {
222: int ihigh = nNestedEvents-1, ilow = 0;
223: while (ihigh>ilow) {
224: const int imiddle = (ihigh+ilow)/2;
225: if (nestedEvents[imiddle].nstEvent > nstEvent) {
226: ihigh = imiddle;
227: } else if (nestedEvents[imiddle].nstEvent<nstEvent) {
228: ilow = imiddle+1;
229: } else {
230: ihigh = imiddle;
231: ilow = imiddle;
232: }
233: }
234: *entry = ihigh;
235: }
236: return(0);
237: }
239: /*
240: Nested logging is not prepared yet to support user-defined logging stages, so for now we force logging on the main stage.
241: Using PetscLogStage{Push/Pop}() would be more appropriate, but these two calls do extra bookkeeping work we don't need.
242: */
244: #define MAINSTAGE 0
246: static PetscLogStage savedStage = 0;
248: PETSC_STATIC_INLINE PetscErrorCode PetscLogStageOverride(void)
249: {
250: PetscStageLog stageLog = petsc_stageLog;
254: if (stageLog->curStage == MAINSTAGE) return(0);
255: savedStage = stageLog->curStage;
256: stageLog->curStage = MAINSTAGE;
257: PetscIntStackPush(stageLog->stack, MAINSTAGE);
258: return(0);
259: }
261: PETSC_STATIC_INLINE PetscErrorCode PetscLogStageRestore(void)
262: {
263: PetscStageLog stageLog = petsc_stageLog;
267: if (savedStage == MAINSTAGE) return(0);
268: stageLog->curStage = savedStage;
269: PetscIntStackPop(stageLog->stack, &savedStage);
270: return(0);
271: }
273: /******************************************************************************************/
274: /* Start a nested event */
275: static PetscErrorCode PetscLogEventBeginNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
276: {
277: PetscErrorCode ierr;
278: int entry, pentry, tentry,i;
279: PetscLogEvent dftEvent;
282: PetscLogEventFindNestedTimer(nstEvent, &entry);
283: if (entry>=nNestedEvents || nestedEvents[entry].nstEvent != nstEvent) {
284: /* Nested event doesn't exist yet: create it */
286: if (nNestedEvents==nNestedEventsAllocated) {
287: /* Enlarge and re-allocate nestedEvents if needed */
288: PetscNestedEvent *tmp = nestedEvents;
289: PetscMalloc1(2*nNestedEvents,&nestedEvents);
290: nNestedEventsAllocated*=2;
291: PetscArraycpy(nestedEvents, tmp, nNestedEvents);
292: PetscFree(tmp);
293: }
295: /* Clear space in nestedEvents for new nested event */
296: nNestedEvents++;
297: for (i = nNestedEvents-1; i>entry; i--) {
298: nestedEvents[i] = nestedEvents[i-1];
299: }
301: /* Create event in nestedEvents */
302: nestedEvents[entry].nstEvent = nstEvent;
303: nestedEvents[entry].nParents=1;
304: PetscMalloc4(1,&nestedEvents[entry].dftParentsSorted,1,&nestedEvents[entry].dftEventsSorted,1,&nestedEvents[entry].dftParents,1,&nestedEvents[entry].dftEvents);
306: /* Fill in new event */
307: pentry = 0;
308: dftEvent = (PetscLogEvent) nstEvent;
310: nestedEvents[entry].nstEvent = nstEvent;
311: nestedEvents[entry].dftParents[pentry] = dftParentActive;
312: nestedEvents[entry].dftEvents[pentry] = dftEvent;
313: nestedEvents[entry].dftParentsSorted[pentry] = dftParentActive;
314: nestedEvents[entry].dftEventsSorted[pentry] = dftEvent;
316: } else {
317: /* Nested event exists: find current dftParentActive among parents */
318: PetscLogEvent *dftParentsSorted = nestedEvents[entry].dftParentsSorted;
319: PetscLogEvent *dftEvents = nestedEvents[entry].dftEvents;
320: int nParents = nestedEvents[entry].nParents;
322: PetscLogEventFindDefaultTimer( dftParentActive, dftParentsSorted, nParents, &pentry);
324: if (pentry>=nParents || dftParentActive != dftParentsSorted[pentry]) {
325: /* dftParentActive not in the list: add it to the list */
326: int i;
327: PetscLogEvent *dftParents = nestedEvents[entry].dftParents;
328: PetscLogEvent *dftEventsSorted = nestedEvents[entry].dftEventsSorted;
329: char name[100];
331: /* Register a new default timer */
332: sprintf(name, "%d -> %d", (int) dftParentActive, (int) nstEvent);
333: PetscLogEventRegister(name, 0, &dftEvent);
334: PetscLogEventFindDefaultTimer( dftEvent, dftEventsSorted, nParents, &tentry);
336: /* Reallocate parents and dftEvents to make space for new parent */
337: PetscMalloc4(1+nParents,&nestedEvents[entry].dftParentsSorted,1+nParents,&nestedEvents[entry].dftEventsSorted,1+nParents,&nestedEvents[entry].dftParents,1+nParents,&nestedEvents[entry].dftEvents);
338: PetscArraycpy(nestedEvents[entry].dftParentsSorted, dftParentsSorted, nParents);
339: PetscArraycpy(nestedEvents[entry].dftEventsSorted, dftEventsSorted, nParents);
340: PetscArraycpy(nestedEvents[entry].dftParents, dftParents, nParents);
341: PetscArraycpy(nestedEvents[entry].dftEvents, dftEvents, nParents);
342: PetscFree4(dftParentsSorted,dftEventsSorted,dftParents,dftEvents);
344: dftParents = nestedEvents[entry].dftParents;
345: dftEvents = nestedEvents[entry].dftEvents;
346: dftParentsSorted = nestedEvents[entry].dftParentsSorted;
347: dftEventsSorted = nestedEvents[entry].dftEventsSorted;
349: nestedEvents[entry].nParents++;
350: nParents++;
352: for (i = nParents-1; i>pentry; i--) {
353: dftParentsSorted[i] = dftParentsSorted[i-1];
354: dftEvents[i] = dftEvents[i-1];
355: }
356: for (i = nParents-1; i>tentry; i--) {
357: dftParents[i] = dftParents[i-1];
358: dftEventsSorted[i] = dftEventsSorted[i-1];
359: }
361: /* Fill in the new default timer */
362: dftParentsSorted[pentry] = dftParentActive;
363: dftEvents[pentry] = dftEvent;
364: dftParents[tentry] = dftParentActive;
365: dftEventsSorted[tentry] = dftEvent;
367: } else {
368: /* dftParentActive was found: find the corresponding default 'dftEvent'-timer */
369: dftEvent = nestedEvents[entry].dftEvents[pentry];
370: }
371: }
373: /* Start the default 'dftEvent'-timer and update the dftParentActive */
374: PetscLogStageOverride();
375: PetscLogEventBeginDefault(dftEvent,t,o1,o2,o3,o4);
376: PetscLogStageRestore();
377: dftParentActive = dftEvent;
378: return(0);
379: }
381: /* End a nested event */
382: static PetscErrorCode PetscLogEventEndNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
383: {
384: PetscErrorCode ierr;
385: int entry, pentry, nParents;
386: PetscLogEvent *dftEventsSorted;
389: /* Find the nested event */
390: PetscLogEventFindNestedTimer(nstEvent, &entry);
391: if (entry>=nNestedEvents) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Logging event %d larger than number of events %d",entry,nNestedEvents);
392: if (nestedEvents[entry].nstEvent != nstEvent) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Logging event %d had unbalanced begin/end pairs does not match %d",entry,nstEvent);
393: dftEventsSorted = nestedEvents[entry].dftEventsSorted;
394: nParents = nestedEvents[entry].nParents;
396: /* Find the current default timer among the 'dftEvents' of this event */
397: PetscLogEventFindDefaultTimer( dftParentActive, dftEventsSorted, nParents, &pentry);
399: if (pentry>=nParents) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Entry %d is larger than number of parents %d",pentry,nParents);
400: if (dftEventsSorted[pentry] != dftParentActive) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Active parent is %d, but we seem to be closing %d",dftParentActive,dftEventsSorted[pentry]);
402: /* Stop the default timer and update the dftParentActive */
403: PetscLogStageOverride();
404: PetscLogEventEndDefault(dftParentActive,t,o1,o2,o3,o4);
405: PetscLogStageRestore();
406: dftParentActive = nestedEvents[entry].dftParents[pentry];
407: return(0);
408: }
410: /*@
411: PetscLogSetThreshold - Set the threshold time for logging the events; this is a percentage out of 100, so 1. means any event
412: that takes 1 or more percent of the time.
414: Logically Collective over PETSC_COMM_WORLD
416: Input Parameter:
417: . newThresh - the threshold to use
419: Output Parameter:
420: . oldThresh - the previously set threshold value
422: Options Database Keys:
423: . -log_view :filename.xml:ascii_xml - Prints an XML summary of flop and timing information to the file
425: Usage:
426: .vb
427: PetscInitialize(...);
428: PetscLogNestedBegin();
429: PetscLogSetThreshold(0.1,&oldthresh);
430: ... code ...
431: PetscLogView(viewer);
432: PetscFinalize();
433: .ve
435: Level: advanced
437: .seealso: PetscLogDump(), PetscLogAllBegin(), PetscLogView(), PetscLogTraceBegin(), PetscLogDefaultBegin(),
438: PetscLogNestedBegin()
439: @*/
440: PetscErrorCode PetscLogSetThreshold(PetscLogDouble newThresh, PetscLogDouble *oldThresh)
441: {
443: if (oldThresh) *oldThresh = thresholdTime;
444: if (newThresh == PETSC_DECIDE) newThresh = 0.01;
445: if (newThresh == PETSC_DEFAULT) newThresh = 0.01;
446: thresholdTime = PetscMax(newThresh, 0.0);
447: return(0);
448: }
450: static PetscErrorCode PetscPrintExeSpecs(PetscViewer viewer)
451: {
452: PetscErrorCode ierr;
453: char arch[128],hostname[128],username[128],pname[PETSC_MAX_PATH_LEN],date[128];
454: char version[256], buildoptions[128] = "";
455: PetscMPIInt size;
456: size_t len;
459: MPI_Comm_size(PetscObjectComm((PetscObject)viewer),&size);
460: PetscGetArchType(arch,sizeof(arch));
461: PetscGetHostName(hostname,sizeof(hostname));
462: PetscGetUserName(username,sizeof(username));
463: PetscGetProgramName(pname,sizeof(pname));
464: PetscGetDate(date,sizeof(date));
465: PetscGetVersion(version,sizeof(version));
467: PetscViewerXMLStartSection(viewer, "runspecification", "Run Specification");
468: PetscViewerXMLPutString( viewer, "executable" , "Executable" , pname);
469: PetscViewerXMLPutString( viewer, "architecture", "Architecture" , arch);
470: PetscViewerXMLPutString( viewer, "hostname" , "Host" , hostname);
471: PetscViewerXMLPutInt( viewer, "nprocesses" , "Number of processes", size);
472: PetscViewerXMLPutString( viewer, "user" , "Run by user" , username);
473: PetscViewerXMLPutString( viewer, "date" , "Started at" , date);
474: PetscViewerXMLPutString( viewer, "petscrelease", "Petsc Release", version);
476: if (PetscDefined(USE_DEBUG)) {
477: PetscStrlcat(buildoptions, "Debug ", sizeof(buildoptions));
478: }
479: if (PetscDefined(USE_COMPLEX)) {
480: PetscStrlcat(buildoptions, "Complex ", sizeof(buildoptions));
481: }
482: if (PetscDefined(USE_REAL_SINGLE)) {
483: PetscStrlcat(buildoptions, "Single ", sizeof(buildoptions));
484: } else if (PetscDefined(USE_REAL___FLOAT128)) {
485: PetscStrlcat(buildoptions, "Quadruple ", sizeof(buildoptions));
486: } else if (PetscDefined(USE_REAL___FP16)) {
487: PetscStrlcat(buildoptions, "Half ", sizeof(buildoptions));
488: }
489: if (PetscDefined(USE_64BIT_INDICES)) {
490: PetscStrlcat(buildoptions, "Int64 ", sizeof(buildoptions));
491: }
492: #if defined(__cplusplus)
493: PetscStrlcat(buildoptions, "C++ ", sizeof(buildoptions));
494: #endif
495: PetscStrlen(buildoptions,&len);
496: if (len) {
497: PetscViewerXMLPutString(viewer, "petscbuildoptions", "Petsc build options", buildoptions);
498: }
499: PetscViewerXMLEndSection(viewer, "runspecification");
500: return(0);
501: }
503: /* Print the global performance: max, max/min, average and total of
504: * time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
505: */
506: static PetscErrorCode PetscPrintXMLGlobalPerformanceElement(PetscViewer viewer, const char *name, const char *desc, PetscLogDouble local_val, const PetscBool print_average, const PetscBool print_total)
507: {
508: PetscErrorCode ierr;
509: PetscLogDouble min, tot, ratio, avg;
510: MPI_Comm comm;
511: PetscMPIInt rank, size;
512: PetscLogDouble valrank[2], max[2];
515: PetscObjectGetComm((PetscObject)viewer,&comm);
516: MPI_Comm_size(PetscObjectComm((PetscObject)viewer),&size);
517: MPI_Comm_rank(comm, &rank);
519: valrank[0] = local_val;
520: valrank[1] = (PetscLogDouble) rank;
521: MPIU_Allreduce(&local_val, &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
522: MPIU_Allreduce(valrank, &max, 1, MPIU_2PETSCLOGDOUBLE, MPI_MAXLOC, comm);
523: MPIU_Allreduce(&local_val, &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
524: avg = tot/((PetscLogDouble) size);
525: if (min != 0.0) ratio = max[0]/min;
526: else ratio = 0.0;
528: PetscViewerXMLStartSection(viewer, name, desc);
529: PetscViewerXMLPutDouble(viewer, "max", NULL, max[0], "%e");
530: PetscViewerXMLPutInt( viewer, "maxrank" , "rank at which max was found" , (PetscMPIInt) max[1]);
531: PetscViewerXMLPutDouble(viewer, "ratio", NULL, ratio, "%f");
532: if (print_average) {
533: PetscViewerXMLPutDouble(viewer, "average", NULL, avg, "%e");
534: }
535: if (print_total) {
536: PetscViewerXMLPutDouble(viewer, "total", NULL, tot, "%e");
537: }
538: PetscViewerXMLEndSection(viewer, name);
539: return(0);
540: }
542: /* Print the global performance: max, max/min, average and total of
543: * time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
544: */
545: static PetscErrorCode PetscPrintGlobalPerformance(PetscViewer viewer, PetscLogDouble locTotalTime)
546: {
547: PetscErrorCode ierr;
548: PetscLogDouble flops, mem, red, mess;
549: const PetscBool print_total_yes = PETSC_TRUE,
550: print_total_no = PETSC_FALSE,
551: print_average_no = PETSC_FALSE,
552: print_average_yes = PETSC_TRUE;
555: /* Must preserve reduction count before we go on */
556: red = petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct;
558: /* Calculate summary information */
559: PetscViewerXMLStartSection(viewer, "globalperformance", "Global performance");
561: /* Time */
562: PetscPrintXMLGlobalPerformanceElement(viewer, "time", "Time (sec)", locTotalTime, print_average_yes, print_total_no);
564: /* Objects */
565: PetscPrintXMLGlobalPerformanceElement(viewer, "objects", "Objects", (PetscLogDouble) petsc_numObjects, print_average_yes, print_total_no);
567: /* Flop */
568: PetscPrintXMLGlobalPerformanceElement(viewer, "mflop", "MFlop", petsc_TotalFlops/1.0E6, print_average_yes, print_total_yes);
570: /* Flop/sec -- Must talk to Barry here */
571: if (locTotalTime != 0.0) flops = petsc_TotalFlops/locTotalTime;
572: else flops = 0.0;
573: PetscPrintXMLGlobalPerformanceElement(viewer, "mflops", "MFlop/sec", flops/1.0E6, print_average_yes, print_total_yes);
575: /* Memory */
576: PetscMallocGetMaximumUsage(&mem);
577: if (mem > 0.0) {
578: PetscPrintXMLGlobalPerformanceElement(viewer, "memory", "Memory (MiB)", mem/1024.0/1024.0, print_average_yes, print_total_yes);
579: }
580: /* Messages */
581: mess = 0.5*(petsc_irecv_ct + petsc_isend_ct + petsc_recv_ct + petsc_send_ct);
582: PetscPrintXMLGlobalPerformanceElement(viewer, "messagetransfers", "MPI Message Transfers", mess, print_average_yes, print_total_yes);
584: /* Message Volume */
585: mess = 0.5*(petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len);
586: PetscPrintXMLGlobalPerformanceElement(viewer, "messagevolume", "MPI Message Volume (MiB)", mess/1024.0/1024.0, print_average_yes, print_total_yes);
588: /* Reductions */
589: PetscPrintXMLGlobalPerformanceElement(viewer, "reductions", "MPI Reductions", red , print_average_no, print_total_no);
590: PetscViewerXMLEndSection(viewer, "globalperformance");
591: return(0);
592: }
594: typedef struct {
595: PetscLogEvent dftEvent;
596: NestedEventId nstEvent;
597: PetscLogEvent dftParent;
598: NestedEventId nstParent;
599: PetscBool own;
600: int depth;
601: NestedEventId* nstPath;
602: } PetscNestedEventTree;
604: /* Compare timers to sort them in the tree */
605: static int compareTreeItems(const void *item1_, const void *item2_)
606: {
607: int i;
608: PetscNestedEventTree *item1 = (PetscNestedEventTree *) item1_;
609: PetscNestedEventTree *item2 = (PetscNestedEventTree *) item2_;
611: for (i=0; i<PetscMin(item1->depth,item2->depth); i++) {
612: if (item1->nstPath[i]<item2->nstPath[i]) return -1;
613: if (item1->nstPath[i]>item2->nstPath[i]) return +1;
614: }
615: if (item1->depth < item2->depth) return -1;
616: if (item1->depth > item2->depth) return 1;
617: return 0;
618: }
619: /*
620: * Do MPI communication to get the complete, nested calling tree for all processes: there may be
621: * calls that happen in some processes, but not in others.
622: *
623: * The output, tree[nTimers] is an array of PetscNestedEventTree-structs.
624: * The tree is sorted so that the timers can be printed in the order of appearance.
625: *
626: * For tree-items which appear in the trees of multiple processes (which will be most items), the
627: * following rule is followed:
628: * + if information from my own process is available, then that is the information stored in tree.
629: * otherwise it is some other process's information.
630: */
631: static PetscErrorCode PetscLogNestedTreeCreate(PetscViewer viewer, PetscNestedEventTree **p_tree, int *p_nTimers)
632: {
633: PetscNestedEventTree *tree = NULL, *newTree;
634: int *treeIndices;
635: int nTimers, totalNTimers, i, j, iTimer0, maxDefaultTimer;
636: int yesno;
637: PetscBool done;
638: PetscErrorCode ierr;
639: int maxdepth;
640: int depth;
641: int illegalEvent;
642: int iextra;
643: NestedEventId *nstPath, *nstMyPath;
644: MPI_Comm comm;
647: PetscObjectGetComm((PetscObject)viewer,&comm);
649: /* Calculate memory needed to store everybody's information and allocate tree */
650: nTimers = 0;
651: for (i=0; i<nNestedEvents; i++) nTimers += nestedEvents[i].nParents;
653: PetscMalloc1(nTimers,&tree);
655: /* Fill tree with readily available information */
656: iTimer0 = 0;
657: maxDefaultTimer =0;
658: for (i=0; i<nNestedEvents; i++) {
659: int nParents = nestedEvents[i].nParents;
660: NestedEventId nstEvent = nestedEvents[i].nstEvent;
661: PetscLogEvent *dftParentsSorted = nestedEvents[i].dftParentsSorted;
662: PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
663: for (j=0; j<nParents; j++) {
664: maxDefaultTimer = PetscMax(dftEvents[j],maxDefaultTimer);
666: tree[iTimer0+j].dftEvent = dftEvents[j];
667: tree[iTimer0+j].nstEvent = nstEvent;
668: tree[iTimer0+j].dftParent = dftParentsSorted[j];
669: tree[iTimer0+j].own = PETSC_TRUE;
671: tree[iTimer0+j].nstParent = 0;
672: tree[iTimer0+j].depth = 0;
673: tree[iTimer0+j].nstPath = NULL;
674: }
675: iTimer0 += nParents;
676: }
678: /* Calculate the global maximum for the default timer index, so array treeIndices can
679: * be allocated only once */
680: MPIU_Allreduce(&maxDefaultTimer, &j, 1, MPI_INT, MPI_MAX, comm);
681: maxDefaultTimer = j;
683: /* Find default timer's place in the tree */
684: PetscCalloc1(maxDefaultTimer+1,&treeIndices);
685: treeIndices[0] = 0;
686: for (i=0; i<nTimers; i++) {
687: PetscLogEvent dftEvent = tree[i].dftEvent;
688: treeIndices[dftEvent] = i;
689: }
691: /* Find each dftParent's nested identification */
692: for (i=0; i<nTimers; i++) {
693: PetscLogEvent dftParent = tree[i].dftParent;
694: if (dftParent!= DFT_ID_AWAKE) {
695: int j = treeIndices[dftParent];
696: tree[i].nstParent = tree[j].nstEvent;
697: }
698: }
700: /* Find depths for each timer path */
701: done = PETSC_FALSE;
702: maxdepth = 0;
703: while (!done) {
704: done = PETSC_TRUE;
705: for (i=0; i<nTimers; i++) {
706: if (tree[i].dftParent == DFT_ID_AWAKE) {
707: tree[i].depth = 1;
708: maxdepth = PetscMax(1,maxdepth);
709: } else {
710: int j = treeIndices[tree[i].dftParent];
711: depth = 1+tree[j].depth;
712: if (depth>tree[i].depth) {
713: done = PETSC_FALSE;
714: tree[i].depth = depth;
715: maxdepth = PetscMax(depth,maxdepth);
716: }
717: }
718: }
719: }
721: /* Allocate the paths in the entire tree */
722: for (i=0; i<nTimers; i++) {
723: depth = tree[i].depth;
724: PetscCalloc1(depth,&tree[i].nstPath);
725: }
727: /* Calculate the paths for all timers */
728: for (depth=1; depth<=maxdepth; depth++) {
729: for (i=0; i<nTimers; i++) {
730: if (tree[i].depth==depth) {
731: if (depth>1) {
732: int j = treeIndices[tree[i].dftParent];
733: PetscArraycpy(tree[i].nstPath,tree[j].nstPath,depth-1);
734: }
735: tree[i].nstPath[depth-1] = tree[i].nstEvent;
736: }
737: }
738: }
739: PetscFree(treeIndices);
741: /* Sort the tree on basis of the paths */
742: qsort(tree, nTimers, sizeof(PetscNestedEventTree), compareTreeItems);
744: /* Allocate an array to store paths */
745: depth = maxdepth;
746: MPIU_Allreduce(&depth, &maxdepth, 1, MPI_INT, MPI_MAX, comm);
747: PetscMalloc1(maxdepth+1, &nstPath);
748: PetscMalloc1(maxdepth+1, &nstMyPath);
750: /* Find an illegal nested event index (1+largest nested event index) */
751: illegalEvent = 1+nestedEvents[nNestedEvents-1].nstEvent;
752: i = illegalEvent;
753: MPIU_Allreduce(&i, &illegalEvent, 1, MPI_INT, MPI_MAX, comm);
755: /* First, detect timers which are not available in this process, but are available in others
756: * Allocate a new tree, that can contain all timers
757: * Then, fill the new tree with all (own and not-own) timers */
758: newTree= NULL;
759: for (yesno=0; yesno<=1; yesno++) {
760: depth = 1;
761: i = 0;
762: iextra = 0;
763: while (depth>0) {
764: int j;
765: PetscBool same;
767: /* Construct the next path in this process's tree:
768: * if necessary, supplement with invalid path entries */
769: depth++;
770: if (depth > maxdepth + 1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Depth %d > maxdepth+1 %d",depth,maxdepth+1);
771: if (i<nTimers) {
772: for (j=0; j<tree[i].depth; j++) nstMyPath[j] = tree[i].nstPath[j];
773: for (j=tree[i].depth; j<depth; j++) nstMyPath[j] = illegalEvent;
774: } else {
775: for (j=0; j<depth; j++) nstMyPath[j] = illegalEvent;
776: }
778: /* Communicate with other processes to obtain the next path and its depth */
779: MPIU_Allreduce(nstMyPath, nstPath, depth, MPI_INT, MPI_MIN, comm);
780: for (j=depth-1; (int) j>=0; j--) {
781: if (nstPath[j]==illegalEvent) depth=j;
782: }
784: if (depth>0) {
785: /* If the path exists */
787: /* check whether the next path is the same as this process's next path */
788: same = PETSC_TRUE;
789: for (j=0; same && j<depth; j++) { same = (same && nstMyPath[j] == nstPath[j]) ? PETSC_TRUE : PETSC_FALSE;}
791: if (same) {
792: /* Register 'own path' */
793: if (newTree) newTree[i+iextra] = tree[i];
794: i++;
795: } else {
796: /* Register 'not an own path' */
797: if (newTree) {
798: newTree[i+iextra].nstEvent = nstPath[depth-1];
799: newTree[i+iextra].own = PETSC_FALSE;
800: newTree[i+iextra].depth = depth;
801: PetscMalloc1(depth, &newTree[i+iextra].nstPath);
802: for (j=0; j<depth; j++) {newTree[i+iextra].nstPath[j] = nstPath[j];}
804: newTree[i+iextra].dftEvent = 0;
805: newTree[i+iextra].dftParent = 0;
806: newTree[i+iextra].nstParent = 0;
807: }
808: iextra++;
809: }
811: }
812: }
814: /* Determine the size of the complete tree (with own and not-own timers) and allocate the new tree */
815: totalNTimers = nTimers + iextra;
816: if (!newTree) {
817: PetscMalloc1(totalNTimers, &newTree);
818: }
819: }
820: PetscFree(nstPath);
821: PetscFree(nstMyPath);
822: PetscFree(tree);
823: tree = newTree;
824: newTree = NULL;
826: /* Set return value and return */
827: *p_tree = tree;
828: *p_nTimers = totalNTimers;
829: return(0);
830: }
832: /*
833: * Delete the nested timer tree
834: */
835: static PetscErrorCode PetscLogNestedTreeDestroy(PetscNestedEventTree *tree, int nTimers)
836: {
837: int i;
838: PetscErrorCode ierr;
841: for (i=0; i<nTimers; i++) {
842: PetscFree(tree[i].nstPath);
843: }
844: PetscFree(tree);
845: return(0);
846: }
848: /* Print the global performance: max, max/min, average and total of
849: * time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
850: */
851: static PetscErrorCode PetscPrintXMLNestedLinePerfResults(PetscViewer viewer,const char *name,PetscLogDouble value,PetscLogDouble minthreshold,PetscLogDouble maxthreshold,PetscLogDouble minmaxtreshold)
852: {
853: MPI_Comm comm; /* MPI communicator in reduction */
854: PetscMPIInt rank; /* rank of this process */
855: PetscLogDouble val_in[2], max[2], min[2];
856: PetscLogDouble minvalue, maxvalue, tot;
857: PetscMPIInt size;
858: PetscMPIInt minLoc, maxLoc;
862: PetscObjectGetComm((PetscObject)viewer,&comm);
863: MPI_Comm_size(comm, &size);
864: MPI_Comm_rank(comm, &rank);
865: val_in[0] = value;
866: val_in[1] = (PetscLogDouble) rank;
867: MPIU_Allreduce(val_in, max, 1, MPIU_2PETSCLOGDOUBLE, MPI_MAXLOC, comm);
868: MPIU_Allreduce(val_in, min, 1, MPIU_2PETSCLOGDOUBLE, MPI_MINLOC, comm);
869: maxvalue = max[0];
870: maxLoc = (PetscMPIInt) max[1];
871: minvalue = min[0];
872: minLoc = (PetscMPIInt) min[1];
873: MPIU_Allreduce(&value, &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
875: if (maxvalue<maxthreshold && minvalue>=minthreshold) {
876: /* One call per parent or NO value: don't print */
877: } else {
878: PetscViewerXMLStartSection(viewer, name, NULL);
879: if (maxvalue>minvalue*minmaxtreshold) {
880: PetscViewerXMLPutDouble(viewer, "avgvalue", NULL, tot/size, "%g");
881: PetscViewerXMLPutDouble(viewer, "minvalue", NULL, minvalue, "%g");
882: PetscViewerXMLPutDouble(viewer, "maxvalue", NULL, maxvalue, "%g");
883: PetscViewerXMLPutInt( viewer, "minloc" , NULL, minLoc);
884: PetscViewerXMLPutInt( viewer, "maxloc" , NULL, maxLoc);
885: } else {
886: PetscViewerXMLPutDouble(viewer, "value", NULL, tot/size, "%g");
887: }
888: PetscViewerXMLEndSection(viewer, name);
889: }
890: return(0);
891: }
893: #define N_COMM 8
894: static PetscErrorCode PetscLogNestedTreePrintLine(PetscViewer viewer,PetscEventPerfInfo perfInfo,PetscLogDouble countsPerCall,int parentCount,int depth,const char *name,PetscLogDouble totalTime,PetscBool *isPrinted)
895: {
896: PetscLogDouble time = perfInfo.time;
897: PetscLogDouble timeMx;
899: MPI_Comm comm;
902: PetscObjectGetComm((PetscObject)viewer,&comm);
903: MPIU_Allreduce(&time, &timeMx, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
904: *isPrinted = ((timeMx/totalTime) >= THRESHOLD) ? PETSC_TRUE : PETSC_FALSE;
905: if (*isPrinted) {
906: PetscViewerXMLStartSection(viewer, "event", NULL);
907: PetscViewerXMLPutString(viewer, "name", NULL, name);
908: PetscPrintXMLNestedLinePerfResults(viewer, "time", time/totalTime*100.0, 0, 0, 1.02);
909: PetscPrintXMLNestedLinePerfResults(viewer, "ncalls", parentCount>0 ? countsPerCall : 0, 0.99, 1.01, 1.02);
910: PetscPrintXMLNestedLinePerfResults(viewer, "mflops", time>=timeMx*0.001 ? 1e-6*perfInfo.flops/time : 0, 0, 0.01, 1.05);
911: PetscPrintXMLNestedLinePerfResults(viewer, "mbps",time>=timeMx*0.001 ? perfInfo.messageLength/(1024*1024*time) : 0, 0, 0.01, 1.05);
912: PetscPrintXMLNestedLinePerfResults(viewer, "nreductsps", time>=timeMx*0.001 ? perfInfo.numReductions/time : 0, 0, 0.01, 1.05);
913: }
914: return(0);
915: }
917: /* Count the number of times the parent event was called */
919: static int countParents( const PetscNestedEventTree *tree, PetscEventPerfInfo *eventPerfInfo, int i)
920: {
921: if (tree[i].depth<=1) {
922: return 1; /* Main event: only once */
923: } else if (!tree[i].own) {
924: return 1; /* This event didn't happen in this process, but did in another */
925: } else {
926: int iParent;
927: for (iParent=i-1; iParent>=0; iParent--) {
928: if (tree[iParent].depth == tree[i].depth-1) break;
929: }
930: if (tree[iParent].depth != tree[i].depth-1) {
931: /* ***** Internal error: cannot find parent */
932: return -2;
933: } else {
934: PetscLogEvent dftEvent = tree[iParent].dftEvent;
935: return eventPerfInfo[dftEvent].count;
936: }
937: }
938: }
940: typedef struct {
941: int id;
942: PetscLogDouble val;
943: } PetscSortItem;
945: static int compareSortItems(const void *item1_, const void *item2_)
946: {
947: PetscSortItem *item1 = (PetscSortItem *) item1_;
948: PetscSortItem *item2 = (PetscSortItem *) item2_;
949: if (item1->val > item2->val) return -1;
950: if (item1->val < item2->val) return +1;
951: return 0;
952: }
954: /*
955: * Find the number of child events.
956: */
957: static PetscErrorCode PetscLogNestedTreeGetChildrenCount(const PetscNestedEventTree *tree,int nTimers,int iStart,int depth,int *nChildren)
958: {
959: int n=0;
962: for (int i=iStart+1; i<nTimers; i++) {
963: if (tree[i].depth <= depth) break;
964: if (tree[i].depth == depth + 1) n++;
965: }
966: *nChildren = n;
967: return(0);
968: }
970: /*
971: * Initialize child event sort items with ID and times.
972: */
973: static PetscErrorCode PetscLogNestedTreeSetChildrenSortItems(const PetscViewer viewer,const PetscNestedEventTree *tree,int nTimers,int iStart,int depth,int nChildren,PetscSortItem **children)
974: {
975: MPI_Comm comm;
976: PetscLogDouble *times, *maxTimes;
977: PetscErrorCode ierr;
978: PetscStageLog stageLog;
979: PetscEventPerfInfo *eventPerfInfo;
980: const int stage = MAINSTAGE;
983: PetscObjectGetComm((PetscObject)viewer,&comm);
984: PetscLogGetStageLog(&stageLog);
985: eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;
987: if (nChildren>0) {
988: /* Create an array for the id-s and maxTimes of the children,
989: * leaving 2 spaces for self-time and other-time */
991: PetscMalloc1(nChildren+2,children);
992: nChildren = 0;
993: for (int i=iStart+1; i<nTimers; i++) {
994: if (tree[i].depth<=depth) break;
995: if (tree[i].depth == depth + 1) {
996: (*children)[nChildren].id = i;
997: (*children)[nChildren].val = eventPerfInfo[tree[i].dftEvent].time ;
998: nChildren++;
999: }
1000: }
1002: /* Calculate the children's maximum times, to see whether children will be ignored or printed */
1003: PetscMalloc1(nChildren,×);
1004: for (int i=0; i<nChildren; i++) { times[i] = (*children)[i].val; }
1006: PetscMalloc1(nChildren,&maxTimes);
1007: MPIU_Allreduce(times, maxTimes, nChildren, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1008: PetscFree(times);
1010: for (int i=0; i<nChildren; i++) { (*children)[i].val = maxTimes[i]; }
1011: PetscFree(maxTimes);
1012: }
1013: return(0);
1014: }
1016: /*
1017: * Set 'self' and 'other' performance info.
1018: */
1019: static PetscErrorCode PetscLogNestedTreeSetSelfOtherPerfInfo(const PetscNestedEventTree *tree,int iStart,PetscLogDouble totalTime,const PetscSortItem *children,int nChildren,
1020: PetscEventPerfInfo *myPerfInfo,PetscEventPerfInfo *selfPerfInfo,PetscEventPerfInfo *otherPerfInfo,int *parentCount,PetscLogDouble *countsPerCall)
1021: {
1022: const int stage = MAINSTAGE;
1023: PetscErrorCode ierr;
1024: PetscStageLog stageLog;
1025: PetscEventPerfInfo *eventPerfInfo;
1028: PetscLogGetStageLog(&stageLog);
1029: eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;
1030: if (!tree[iStart].own) {
1031: /* Set values for a timer that was not activated in this process
1032: * (but was, in other processes of this run) */
1033: PetscMemzero(myPerfInfo,sizeof(*myPerfInfo));
1035: *selfPerfInfo = *myPerfInfo;
1036: *otherPerfInfo = *myPerfInfo;
1038: *parentCount = 1;
1039: *countsPerCall = 0;
1040: } else {
1041: /* Set the values for a timer that was activated in this process */
1042: PetscLogEvent dftEvent = tree[iStart].dftEvent;
1044: *parentCount = countParents(tree,eventPerfInfo,iStart);
1045: *myPerfInfo = eventPerfInfo[dftEvent];
1046: *countsPerCall = (PetscLogDouble) myPerfInfo->count / (PetscLogDouble) *parentCount;
1048: *selfPerfInfo = *myPerfInfo;
1049: otherPerfInfo->time = 0;
1050: otherPerfInfo->flops = 0;
1051: otherPerfInfo->numMessages = 0;
1052: otherPerfInfo->messageLength = 0;
1053: otherPerfInfo->numReductions = 0;
1055: for (int i=0; i<nChildren; i++) {
1056: /* For all child counters: subtract the child values from self-timers */
1058: PetscLogEvent dftChild = tree[children[i].id].dftEvent;
1059: PetscEventPerfInfo childPerfInfo = eventPerfInfo[dftChild];
1061: selfPerfInfo->time -= childPerfInfo.time;
1062: selfPerfInfo->flops -= childPerfInfo.flops;
1063: selfPerfInfo->numMessages -= childPerfInfo.numMessages;
1064: selfPerfInfo->messageLength -= childPerfInfo.messageLength;
1065: selfPerfInfo->numReductions -= childPerfInfo.numReductions;
1067: if ((children[i].val/totalTime) < THRESHOLD) {
1068: /* Add them to 'other' if the time is ignored in the output */
1069: otherPerfInfo->time += childPerfInfo.time;
1070: otherPerfInfo->flops += childPerfInfo.flops;
1071: otherPerfInfo->numMessages += childPerfInfo.numMessages;
1072: otherPerfInfo->messageLength += childPerfInfo.messageLength;
1073: otherPerfInfo->numReductions += childPerfInfo.numReductions;
1074: }
1075: }
1076: }
1077: return(0);
1078: }
1080: /*
1081: * Set max times across ranks for 'self' and 'other'.
1082: */
1083: static PetscErrorCode PetscLogNestedTreeSetMaxTimes(MPI_Comm comm,int nChildren,const PetscEventPerfInfo selfPerfInfo,const PetscEventPerfInfo otherPerfInfo,PetscSortItem *children)
1084: {
1085: PetscLogDouble times[2], maxTimes[2];
1086: PetscErrorCode ierr;
1089: times[0] = selfPerfInfo.time;
1090: times[1] = otherPerfInfo.time;
1092: MPIU_Allreduce(times,maxTimes,2,MPIU_PETSCLOGDOUBLE,MPI_MAX,comm);
1093: children[nChildren+0].id = -1;
1094: children[nChildren+0].val = maxTimes[0];
1095: children[nChildren+1].id = -2;
1096: children[nChildren+1].val = maxTimes[1];
1097: return(0);
1098: }
1100: static PetscErrorCode PetscLogNestedTreePrint(PetscViewer viewer, PetscNestedEventTree *tree, int nTimers, int iStart, PetscLogDouble totalTime)
1101: {
1102: int depth = tree[iStart].depth;
1103: const char *name;
1104: int parentCount=1, nChildren;
1105: PetscSortItem *children;
1106: PetscErrorCode ierr;
1107: PetscStageLog stageLog;
1108: PetscEventRegInfo *eventRegInfo;
1109: PetscEventPerfInfo myPerfInfo={0},selfPerfInfo={0},otherPerfInfo={0};
1110: PetscLogDouble countsPerCall=0;
1111: PetscBool wasPrinted;
1112: PetscBool childWasPrinted;
1113: MPI_Comm comm;
1116: /* Look up the name of the event and its PerfInfo */
1117: PetscLogGetStageLog(&stageLog);
1118: eventRegInfo = stageLog->eventLog->eventInfo;
1119: name = eventRegInfo[(PetscLogEvent)tree[iStart].nstEvent].name;
1120: PetscObjectGetComm((PetscObject)viewer,&comm);
1122: PetscLogNestedTreeGetChildrenCount(tree,nTimers,iStart,depth,&nChildren);
1123: PetscLogNestedTreeSetChildrenSortItems(viewer,tree,nTimers,iStart,depth,nChildren,&children);
1124: PetscLogNestedTreeSetSelfOtherPerfInfo(tree,iStart,totalTime,children,nChildren,&myPerfInfo,&selfPerfInfo,&otherPerfInfo,&parentCount,&countsPerCall);
1126: /* Main output for this timer */
1127: PetscLogNestedTreePrintLine(viewer, myPerfInfo, countsPerCall, parentCount, depth, name, totalTime, &wasPrinted);
1129: /* Now print the lines for the children */
1130: if (nChildren > 0) {
1131: int i;
1133: /* Calculate max-times for 'self' and 'other' */
1134: PetscLogNestedTreeSetMaxTimes(comm,nChildren,selfPerfInfo,otherPerfInfo,children);
1136: /* Now sort the children (including 'self' and 'other') on total time */
1137: qsort(children, nChildren+2, sizeof(PetscSortItem), compareSortItems);
1139: /* Print (or ignore) the children in ascending order of total time */
1140: PetscViewerXMLStartSection(viewer,"events", NULL);
1141: for (i=0; i<nChildren+2; i++) {
1142: if ((children[i].val/totalTime) < THRESHOLD) {
1143: /* ignored: no output */
1144: } else if (children[i].id==-1) {
1145: PetscLogNestedTreePrintLine(viewer, selfPerfInfo, 1, parentCount, depth+1, "self", totalTime, &childWasPrinted);
1146: if (childWasPrinted) {
1147: PetscViewerXMLEndSection(viewer,"event");
1148: }
1149: } else if (children[i].id==-2) {
1150: size_t len;
1151: char *otherName;
1153: PetscStrlen(name,&len);
1154: PetscMalloc1(len+16,&otherName);
1155: PetscSNPrintf(otherName,len+16,"%s: other-timed",name);
1156: PetscLogNestedTreePrintLine(viewer, otherPerfInfo, 1, 1, depth+1, otherName, totalTime, &childWasPrinted);
1157: PetscFree(otherName);
1158: if (childWasPrinted) {
1159: PetscViewerXMLEndSection(viewer,"event");
1160: }
1161: } else {
1162: /* Print the child with a recursive call to this function */
1163: PetscLogNestedTreePrint(viewer, tree, nTimers, children[i].id, totalTime);
1164: }
1165: }
1166: PetscViewerXMLEndSection(viewer,"events");
1167: PetscFree(children);
1168: }
1170: if (wasPrinted) {
1171: PetscViewerXMLEndSection(viewer, "event");
1172: }
1173: return(0);
1174: }
1176: static PetscErrorCode PetscLogNestedTreePrintTop(PetscViewer viewer, PetscNestedEventTree *tree, int nTimers, PetscLogDouble totalTime)
1177: {
1178: int i, nChildren;
1179: PetscSortItem *children;
1180: PetscErrorCode ierr;
1181: MPI_Comm comm;
1184: PetscObjectGetComm((PetscObject)viewer,&comm);
1186: PetscLogNestedTreeGetChildrenCount(tree,nTimers,-1,0,&nChildren);
1187: PetscLogNestedTreeSetChildrenSortItems(viewer,tree,nTimers,-1,0,nChildren,&children);
1189: if (nChildren>0) {
1190: /* Now sort the children on total time */
1191: qsort(children, nChildren, sizeof(PetscSortItem), compareSortItems);
1192: /* Print (or ignore) the children in ascending order of total time */
1193: PetscViewerXMLStartSection(viewer, "timertree", "Timings tree");
1194: PetscViewerXMLPutDouble(viewer, "totaltime", NULL, totalTime, "%f");
1195: PetscViewerXMLPutDouble(viewer, "timethreshold", NULL, thresholdTime, "%f");
1197: for (i=0; i<nChildren; i++) {
1198: if ((children[i].val/totalTime) < THRESHOLD) {
1199: /* ignored: no output */
1200: } else {
1201: /* Print the child with a recursive call to this function */
1202: PetscLogNestedTreePrint(viewer, tree, nTimers, children[i].id, totalTime);
1203: }
1204: }
1205: PetscViewerXMLEndSection(viewer, "timertree");
1206: PetscFree(children);
1207: }
1208: return(0);
1209: }
1211: typedef struct {
1212: char *name;
1213: PetscLogDouble time;
1214: PetscLogDouble flops;
1215: PetscLogDouble numMessages;
1216: PetscLogDouble messageLength;
1217: PetscLogDouble numReductions;
1218: } PetscSelfTimer;
1220: static PetscErrorCode PetscCalcSelfTime(PetscViewer viewer, PetscSelfTimer **p_self, int *p_nstMax)
1221: {
1222: PetscErrorCode ierr;
1223: const int stage = MAINSTAGE;
1224: PetscStageLog stageLog;
1225: PetscEventRegInfo *eventRegInfo;
1226: PetscEventPerfInfo *eventPerfInfo;
1227: PetscSelfTimer *selftimes;
1228: PetscSelfTimer *totaltimes;
1229: NestedEventId *nstEvents;
1230: int i, j, maxDefaultTimer;
1231: NestedEventId nst;
1232: PetscLogEvent dft;
1233: int nstMax, nstMax_local;
1234: MPI_Comm comm;
1237: PetscObjectGetComm((PetscObject)viewer,&comm);
1238: PetscLogGetStageLog(&stageLog);
1239: eventRegInfo = stageLog->eventLog->eventInfo;
1240: eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;
1242: /* For each default timer, calculate the (one) nested timer that it corresponds to. */
1243: maxDefaultTimer =0;
1244: for (i=0; i<nNestedEvents; i++) {
1245: int nParents = nestedEvents[i].nParents;
1246: PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1247: for (j=0; j<nParents; j++) maxDefaultTimer = PetscMax(dftEvents[j],maxDefaultTimer);
1248: }
1249: PetscMalloc1(maxDefaultTimer+1,&nstEvents);
1250: for (dft=0; dft<maxDefaultTimer; dft++) {nstEvents[dft] = 0;}
1251: for (i=0; i<nNestedEvents; i++) {
1252: int nParents = nestedEvents[i].nParents;
1253: NestedEventId nstEvent = nestedEvents[i].nstEvent;
1254: PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1255: for (j=0; j<nParents; j++) nstEvents[dftEvents[j]] = nstEvent;
1256: }
1258: /* Calculate largest nested event-ID */
1259: nstMax_local = 0;
1260: for (i=0; i<nNestedEvents; i++) nstMax_local = PetscMax(nestedEvents[i].nstEvent,nstMax_local);
1261: MPIU_Allreduce(&nstMax_local, &nstMax, 1, MPI_INT, MPI_MAX, comm);
1263: /* Initialize all total-times with zero */
1264: PetscMalloc1(nstMax+1,&selftimes);
1265: PetscMalloc1(nstMax+1,&totaltimes);
1266: for (nst=0; nst<=nstMax; nst++) {
1267: totaltimes[nst].time = 0;
1268: totaltimes[nst].flops = 0;
1269: totaltimes[nst].numMessages = 0;
1270: totaltimes[nst].messageLength = 0;
1271: totaltimes[nst].numReductions = 0;
1272: totaltimes[nst].name = NULL;
1273: }
1275: /* Calculate total-times */
1276: for (i=0; i<nNestedEvents; i++) {
1277: const int nParents = nestedEvents[i].nParents;
1278: const NestedEventId nstEvent = nestedEvents[i].nstEvent;
1279: const PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1280: for (j=0; j<nParents; j++) {
1281: const PetscLogEvent dftEvent = dftEvents[j];
1282: totaltimes[nstEvent].time += eventPerfInfo[dftEvent].time;
1283: totaltimes[nstEvent].flops += eventPerfInfo[dftEvent].flops;
1284: totaltimes[nstEvent].numMessages += eventPerfInfo[dftEvent].numMessages;
1285: totaltimes[nstEvent].messageLength += eventPerfInfo[dftEvent].messageLength;
1286: totaltimes[nstEvent].numReductions += eventPerfInfo[dftEvent].numReductions;
1287: }
1288: totaltimes[nstEvent].name = eventRegInfo[(PetscLogEvent)nstEvent].name;
1289: }
1291: /* Initialize: self-times := totaltimes */
1292: for (nst=0; nst<=nstMax; nst++) { selftimes[nst] = totaltimes[nst]; }
1294: /* Subtract timed subprocesses from self-times */
1295: for (i=0; i<nNestedEvents; i++) {
1296: const int nParents = nestedEvents[i].nParents;
1297: const PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1298: const NestedEventId *dftParentsSorted = nestedEvents[i].dftParentsSorted;
1299: for (j=0; j<nParents; j++) {
1300: if (dftParentsSorted[j] != DFT_ID_AWAKE) {
1301: const PetscLogEvent dftEvent = dftEvents[j];
1302: const NestedEventId nstParent = nstEvents[dftParentsSorted[j]];
1303: selftimes[nstParent].time -= eventPerfInfo[dftEvent].time;
1304: selftimes[nstParent].flops -= eventPerfInfo[dftEvent].flops;
1305: selftimes[nstParent].numMessages -= eventPerfInfo[dftEvent].numMessages;
1306: selftimes[nstParent].messageLength -= eventPerfInfo[dftEvent].messageLength;
1307: selftimes[nstParent].numReductions -= eventPerfInfo[dftEvent].numReductions;
1308: }
1309: }
1310: }
1312: PetscFree(nstEvents);
1313: PetscFree(totaltimes);
1315: /* Set outputs */
1316: *p_self = selftimes;
1317: *p_nstMax = nstMax;
1318: return(0);
1319: }
1321: static PetscErrorCode PetscPrintSelfTime(PetscViewer viewer, const PetscSelfTimer *selftimes, int nstMax, PetscLogDouble totalTime)
1322: {
1323: PetscErrorCode ierr;
1324: int i;
1325: NestedEventId nst;
1326: PetscSortItem *sortSelfTimes;
1327: PetscLogDouble *times, *maxTimes;
1328: PetscStageLog stageLog;
1329: PetscEventRegInfo *eventRegInfo;
1330: const int dum_depth = 1, dum_count=1, dum_parentcount=1;
1331: PetscBool wasPrinted;
1332: MPI_Comm comm;
1335: PetscObjectGetComm((PetscObject)viewer,&comm);
1336: PetscLogGetStageLog(&stageLog);
1337: eventRegInfo = stageLog->eventLog->eventInfo;
1339: PetscMalloc1(nstMax+1,×);
1340: PetscMalloc1(nstMax+1,&maxTimes);
1341: for (nst=0; nst<=nstMax; nst++) { times[nst] = selftimes[nst].time;}
1342: MPIU_Allreduce(times, maxTimes, nstMax+1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1343: PetscFree(times);
1345: PetscMalloc1(nstMax+1,&sortSelfTimes);
1347: /* Sort the self-timers on basis of the largest time needed */
1348: for (nst=0; nst<=nstMax; nst++) {
1349: sortSelfTimes[nst].id = nst;
1350: sortSelfTimes[nst].val = maxTimes[nst];
1351: }
1352: PetscFree(maxTimes);
1353: qsort(sortSelfTimes, nstMax+1, sizeof(PetscSortItem), compareSortItems);
1355: PetscViewerXMLStartSection(viewer, "selftimertable", "Self-timings");
1356: PetscViewerXMLPutDouble(viewer, "totaltime", NULL, totalTime, "%f");
1358: for (i=0; i<=nstMax; i++) {
1359: if ((sortSelfTimes[i].val/totalTime) >= THRESHOLD) {
1360: NestedEventId nstEvent = sortSelfTimes[i].id;
1361: const char *name = eventRegInfo[(PetscLogEvent)nstEvent].name;
1362: PetscEventPerfInfo selfPerfInfo;
1364: selfPerfInfo.time = selftimes[nstEvent].time ;
1365: selfPerfInfo.flops = selftimes[nstEvent].flops;
1366: selfPerfInfo.numMessages = selftimes[nstEvent].numMessages;
1367: selfPerfInfo.messageLength = selftimes[nstEvent].messageLength;
1368: selfPerfInfo.numReductions = selftimes[nstEvent].numReductions;
1370: PetscLogNestedTreePrintLine(viewer, selfPerfInfo, dum_count, dum_parentcount, dum_depth, name, totalTime, &wasPrinted);
1371: if (wasPrinted) {
1372: PetscViewerXMLEndSection(viewer, "event");
1373: }
1374: }
1375: }
1376: PetscViewerXMLEndSection(viewer, "selftimertable");
1377: PetscFree(sortSelfTimes);
1378: return(0);
1379: }
1381: PetscErrorCode PetscLogView_Nested(PetscViewer viewer)
1382: {
1383: PetscErrorCode ierr;
1384: PetscLogDouble locTotalTime, globTotalTime;
1385: PetscNestedEventTree *tree = NULL;
1386: PetscSelfTimer *selftimers = NULL;
1387: int nTimers = 0, nstMax = 0;
1388: MPI_Comm comm;
1391: PetscObjectGetComm((PetscObject)viewer,&comm);
1392: PetscViewerInitASCII_XML(viewer);
1393: PetscViewerASCIIPrintf(viewer, "<!-- PETSc Performance Summary: -->\n");
1394: PetscViewerXMLStartSection(viewer, "petscroot", NULL);
1396: /* Get the total elapsed time, local and global maximum */
1397: PetscTime(&locTotalTime); locTotalTime -= petsc_BaseTime;
1398: MPIU_Allreduce(&locTotalTime, &globTotalTime, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1400: /* Print global information about this run */
1401: PetscPrintExeSpecs(viewer);
1402: PetscPrintGlobalPerformance(viewer, locTotalTime);
1403: /* Collect nested timer tree info from all processes */
1404: PetscLogNestedTreeCreate(viewer, &tree, &nTimers);
1405: PetscLogNestedTreePrintTop(viewer, tree, nTimers, globTotalTime);
1406: PetscLogNestedTreeDestroy(tree, nTimers);
1408: /* Calculate self-time for all (not-nested) events */
1409: PetscCalcSelfTime(viewer, &selftimers, &nstMax);
1410: PetscPrintSelfTime(viewer, selftimers, nstMax, globTotalTime);
1411: PetscFree(selftimers);
1413: PetscViewerXMLEndSection(viewer, "petscroot");
1414: PetscViewerFinalASCII_XML(viewer);
1415: return(0);
1416: }
1418: /*
1419: * Get the name of a nested event.
1420: */
1421: static PetscErrorCode PetscGetNestedEventName(const PetscNestedEventTree *tree,int id,char **name)
1422: {
1423: PetscErrorCode ierr;
1424: PetscStageLog stageLog;
1427: PetscLogGetStageLog(&stageLog);
1428: *name = stageLog->eventLog->eventInfo[(PetscLogEvent)tree[id].nstEvent].name;
1429: return(0);
1430: }
1432: /*
1433: * Get the total time elapsed.
1434: */
1435: static PetscErrorCode PetscGetTotalTime(const PetscViewer viewer,PetscLogDouble *totalTime)
1436: {
1437: PetscErrorCode ierr;
1438: PetscLogDouble locTotalTime;
1439: MPI_Comm comm;
1442: PetscObjectGetComm((PetscObject)viewer,&comm);
1443: PetscTime(&locTotalTime);
1444: locTotalTime -= petsc_BaseTime;
1445: MPIU_Allreduce(&locTotalTime,totalTime,1,MPIU_PETSCLOGDOUBLE,MPI_MAX,comm);
1446: return(0);
1447: }
1449: /*
1450: * Write a line to the flame graph output and then recurse into child events.
1451: */
1452: static PetscErrorCode PetscLogNestedTreePrintFlamegraph(PetscViewer viewer,PetscNestedEventTree *tree,int nTimers,int iStart,PetscLogDouble totalTime,PetscIntStack eventStack)
1453: {
1454: int depth=tree[iStart].depth,parentCount=1,i,nChildren;
1455: char *name=NULL;
1456: PetscErrorCode ierr;
1457: PetscEventPerfInfo myPerfInfo={0},selfPerfInfo={0},otherPerfInfo={0};
1458: PetscLogDouble countsPerCall=0,locTime,globTime;
1459: PetscSortItem *children;
1460: PetscStageLog stageLog;
1461: MPI_Comm comm;
1464: PetscLogGetStageLog(&stageLog);
1465: PetscObjectGetComm((PetscObject)viewer,&comm);
1467: /* Determine information about the child events as well as 'self' and 'other' */
1468: PetscLogNestedTreeGetChildrenCount(tree,nTimers,iStart,depth,&nChildren);
1469: PetscLogNestedTreeSetChildrenSortItems(viewer,tree,nTimers,iStart,depth,nChildren,&children);
1470: PetscLogNestedTreeSetSelfOtherPerfInfo(tree,iStart,totalTime,children,nChildren,&myPerfInfo,&selfPerfInfo,&otherPerfInfo,&parentCount,&countsPerCall);
1472: /* Write line to the file. The time shown is 'self' + 'other' because each entry in the output
1473: * is the total time spent in the event minus the amount spent in child events. */
1474: locTime = selfPerfInfo.time + otherPerfInfo.time;
1475: MPIU_Allreduce(&locTime,&globTime,1,MPIU_PETSCLOGDOUBLE,MPI_MAX,comm);
1476: if (globTime/totalTime > THRESHOLD && tree[iStart].own) {
1477: /* Iterate over parent events in the stack and write them */
1478: for (i=0; i<=eventStack->top; i++) {
1479: PetscGetNestedEventName(tree,eventStack->stack[i],&name);
1480: PetscViewerASCIIPrintf(viewer,"%s;",name);
1481: }
1482: PetscGetNestedEventName(tree,iStart,&name);
1483: /* The output is given as an integer in microseconds because otherwise the file cannot be read
1484: * by apps such as speedscope (https://speedscope.app/). */
1485: PetscViewerASCIIPrintf(viewer,"%s %" PetscInt64_FMT "\n",name,(PetscInt64)(globTime*1e6));
1486: }
1488: /* Add the current event to the parent stack and write the child events */
1489: PetscIntStackPush(eventStack, iStart);
1490: for (i=0; i<nChildren; i++) {
1491: PetscLogNestedTreePrintFlamegraph(viewer,tree,nTimers,children[i].id,totalTime,eventStack);
1492: }
1493: /* Pop the top item from the stack and immediately discard it */
1494: {
1495: int tmp;
1496: PetscIntStackPop(eventStack, &tmp);
1497: }
1498: return(0);
1499: }
1501: /*
1502: * Print nested logging information to a file suitable for reading into a Flame Graph.
1503: *
1504: * The format consists of a semicolon-separated list of events and the event duration in microseconds (which must be an integer).
1505: * An example output would look like:
1506: * MatAssemblyBegin 1
1507: * MatAssemblyEnd 10
1508: * MatView 302
1509: * KSPSetUp 98
1510: * KSPSetUp;VecSet 5
1511: * KSPSolve 150
1512: *
1513: * This option may be requested from the command line by passing in the flag `-log_view :<somefile>.txt:ascii_flamegraph`.
1514: */
1515: PetscErrorCode PetscLogView_Flamegraph(PetscViewer viewer)
1516: {
1517: int nTimers=0,i,nChildren;
1518: PetscErrorCode ierr;
1519: PetscIntStack eventStack;
1520: PetscLogDouble totalTime;
1521: PetscNestedEventTree *tree=NULL;
1522: PetscSortItem *children;
1525: PetscGetTotalTime(viewer,&totalTime);
1526: PetscLogNestedTreeCreate(viewer, &tree, &nTimers);
1527: /* We use an integer stack to keep track of parent event IDs */
1528: PetscIntStackCreate(&eventStack);
1530: /* Initialize the child events and write them recursively */
1531: PetscLogNestedTreeGetChildrenCount(tree,nTimers,-1,0,&nChildren);
1532: PetscLogNestedTreeSetChildrenSortItems(viewer,tree,nTimers,-1,0,nChildren,&children);
1533: for (i=0; i<nChildren; i++) {
1534: PetscLogNestedTreePrintFlamegraph(viewer,tree,nTimers,children[i].id,totalTime,eventStack);
1535: }
1537: PetscLogNestedTreeDestroy(tree, nTimers);
1538: PetscIntStackDestroy(eventStack);
1539: return(0);
1540: }
1542: PETSC_EXTERN PetscErrorCode PetscASend(int count, int datatype)
1543: {
1544: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1546: #endif
1549: petsc_send_ct++;
1550: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1551: PetscMPITypeSize(count,MPI_Type_f2c((MPI_Fint) datatype),&petsc_send_len);
1552: #endif
1553: return(0);
1554: }
1556: PETSC_EXTERN PetscErrorCode PetscARecv(int count, int datatype)
1557: {
1558: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1560: #endif
1563: petsc_recv_ct++;
1564: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1565: PetscMPITypeSize(count,MPI_Type_f2c((MPI_Fint) datatype),&petsc_recv_len);
1566: #endif
1567: return(0);
1568: }
1570: PETSC_EXTERN PetscErrorCode PetscAReduce()
1571: {
1573: petsc_allreduce_ct++;
1574: return(0);
1575: }
1577: #endif