
Commit

Merge pull request networkupstools#2535 from jimklimov/issue-2510-ordering-mixedDates

Fix `ChangeLog` content ordering (avoid interleaved dates) and build ordering (avoid potential conflicts and CPU resource waste in parallel builds)
jimklimov authored Jul 16, 2024
2 parents 5740508 + 94991ea commit 1ebd733
Showing 5 changed files with 139 additions and 16 deletions.
10 changes: 8 additions & 2 deletions Makefile.am
@@ -359,7 +359,11 @@ endif
# repo configuration, or submodules). But this is a Git-crawling target
# anyway, and in the worst case (Git's design changes) we would spend a
# bit of time researching the FS in vain, and go on to re-generate the
# ChangeLog when maybe we should not have - oh well:
# ChangeLog when maybe we should not have - oh well.
# WARNING: The CHANGELOG_REQUIRE_GROUP_BY_DATE_AUTHOR=true mode here is the
# default, to allow for prettier documentation, but it can require too much
# memory on weaker build systems. Set it to false when calling make there.
CHANGELOG_REQUIRE_GROUP_BY_DATE_AUTHOR_ENVVAR = true
$(abs_top_builddir)/ChangeLog: tools/gitlog2changelog.py dummy-stamp
@cd $(abs_top_srcdir) && \
if test -e .git ; then \
@@ -369,7 +373,9 @@ $(abs_top_builddir)/ChangeLog: tools/gitlog2changelog.py dummy-stamp
echo "Using still-valid ChangeLog file generated earlier from same revision of Git source metadata in '$${NUT_GITDIR}'" >&2 ; \
else \
echo " DOC-CHANGELOG-GENERATE $@" ; \
CHANGELOG_FILE="$@" $(WITH_PDF_NONASCII_TITLES_ENVVAR) $(abs_top_builddir)/tools/gitlog2changelog.py $(GITLOG_START_POINT) || { \
CHANGELOG_FILE="$@" $(WITH_PDF_NONASCII_TITLES_ENVVAR) \
CHANGELOG_REQUIRE_GROUP_BY_DATE_AUTHOR="$(CHANGELOG_REQUIRE_GROUP_BY_DATE_AUTHOR_ENVVAR)" \
$(abs_top_builddir)/tools/gitlog2changelog.py $(GITLOG_START_POINT) || { \
echo " DOC-CHANGELOG-GENERATE $@ : FAILED (non-fatal)" >&2 ; \
printf "gitlog2changelog.py failed to generate the ChangeLog.\n\nNOTE: See https://github.com/networkupstools/nut/commits/master for change history.\n\n" > "$@" ; \
} ; \
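The hunk above exports the make-level default into the environment of the generator script. For reference, the same switch can be set directly when running the script by hand (a hedged sketch, assuming the `configure`-generated script in the build tree; the output file name is arbitrary and the optional Git start-point argument is omitted):

    # Generate a flat, stream-ordered ChangeLog without date+author grouping,
    # e.g. on a memory-constrained build host (names here are illustrative):
    CHANGELOG_REQUIRE_GROUP_BY_DATE_AUTHOR=false \
    CHANGELOG_FILE=ChangeLog.test \
        ./tools/gitlog2changelog.py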
9 changes: 9 additions & 0 deletions NEWS.adoc
@@ -213,6 +213,15 @@ during a NUT build.
- added a `make distcheck-light-man` recipe to require verification that the
manual page files can be built using the prepared "tarball" archive. [#2473]
- revised the documentation building recipes, with the goal of building the
`ChangeLog` products and their intermediate files no more than once (while
still reacting to `git` metadata changes during development), and of sanity
checking the resulting final document (currently only for `html-single` mode).
As part of this, the `CHANGELOG_REQUIRE_GROUP_BY_DATE_AUTHOR` setting was
added (for `make` calls, and used by the `tools/gitlog2changelog.py.in`
script); it defaults to `true`, allowing for better-ordered documents at the
cost of some memory during document generation. [#2510]
- added a `common/Makefile.am` build product for a new internal library
`libcommonstr.la` which allows a smaller selection of helper methods
for tools like `nut-scanner` which do not need the full `libcommon.la`
7 changes: 7 additions & 0 deletions UPGRADING.adoc
@@ -38,6 +38,13 @@ Changes from 2.8.2 to 2.8.3
pages which are delivered automatically. Packaging recipes can likely
be simplified now. [#2445]
- A `CHANGELOG_REQUIRE_GROUP_BY_DATE_AUTHOR` setting was added (for `make`
calls, and used by the `tools/gitlog2changelog.py.in` script); it defaults
to `true`, allowing for better-ordered documents at the cost of some memory
during document generation. Resource-constrained builders (working from
a Git workspace, not tarball archives) may have to set it to `false` when
calling `make` for NUT. [#2510]
- NUT products like `nut-scanner`, which dynamically load shared libraries
at run-time without persistent pre-linking, should now know the library
file names that were present during build (likely encumbered with version
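A hedged example of what the resource-constrained builders mentioned above could run, overriding the make-level variable defined in `Makefile.am` (any further targets or options would be appended as usual):

    # Keep memory usage down by skipping the date+author grouping pass:
    make CHANGELOG_REQUIRE_GROUP_BY_DATE_AUTHOR_ENVVAR=false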
46 changes: 42 additions & 4 deletions docs/Makefile.am
@@ -18,7 +18,20 @@
MAINTAINERCLEANFILES = Makefile.in .dirstamp
EXTRA_DIST =

all: doc
# Note: "doc" ensures the `configure`-specified list of documents we actually
# want, while the default generated "all: all-am" target historically causes
# some but not all of these targets to get built (e.g. ChangeLog html/pdf is
# usually not made there). Post-processing "doc" as part of "all" helps
# ensure that we do not rebuild stuff in vain during parallel builds (where
# "all-am" and "doc" would be unordered parallel goals of the "all" target)
# while getting those further goals achieved eventually in the default build.
# Crucially, this lets us make sure the "ChangeLog(.adoc*)" files have been
# generated once (which can take a looong while) and settled into place, and
# only then do we revisit them for html/pdf rendering (another long while),
# without randomly confusing the system with new timestamps and needless
# regenerations later on.
all:
@echo " DOC-FOLLOW-UP Basic 'make $@' in `pwd` is done, following up with 'make doc' to ensure complex document types"
+@$(MAKE) $(AM_MAKEFLAGS) doc

# Is "egrep == grep -E" always valid? (maybe all a job for configure.ac)
#EGREP = egrep
@@ -256,9 +269,21 @@ ChangeLog.html-contentchecked:
fi ; \
fi ; \
fi; \
else \
if [ x"$$FAILED" = x ] ; then \
echo "PASSED $@" >&2 ; \
exit 0 ; \
fi ; \
if [ x"$$FAILED" != x ] && [ -s '$(top_builddir)/ChangeLog.adoc' ] \
&& [ "`head -1 $(top_builddir)/ChangeLog.adoc`" = "=== Failed to generate the ChangeLog" ] \
; then \
FAILED="" ; \
fi; \
fi; \
if [ x"$$FAILED" = x ] ; then \
echo "SKIPPED $@ because input files were not available" >&2 ; \
fi
exit 0 ; \
fi ; \
exit 1

check-html-single: $(ASCIIDOC_HTML_SINGLE)
+@FAILED=""; LANG=C; LC_ALL=C; export LANG; export LC_ALL; \
@@ -332,6 +357,8 @@ DOCBUILD_CONVERT_GITHUB_LINKS = { \
.adoc.adoc-parsed:
@$(DOCBUILD_CONVERT_GITHUB_LINKS)

$(top_builddir)/ChangeLog.adoc-parsed: $(top_builddir)/ChangeLog.adoc

dummy:
$(top_builddir)/ChangeLog: dummy
@+echo " DOC-CHANGELOG-GENERATE-WRAPPER $@ : call parent Makefile to decide if (re-)generation is needed" \
@@ -347,6 +374,14 @@ else !WITH_PDF_NONASCII_TITLES
A2X_ASCII_IDS = ":ascii-ids:\n"
endif !WITH_PDF_NONASCII_TITLES

# Probably due to the web of makefiles and an overwhelmed job server in some
# implementations, parallel builds can end up scheduling several threads
# that create this asciidoc file (and the adoc-parsed one later). This step
# only costs a few seconds, but the updated timestamp may cause new HTML/PDF
# builds which cost a lot more. Below we try a few ways to detect a build
# already running and bail out early if the file exists. Otherwise we bite
# the bullet, spend a few seconds, and then re-check whether another thread
# did the same work and finished first.
$(top_builddir)/ChangeLog.adoc: $(top_builddir)/ChangeLog
@INPUT="$?"; \
test -n "$${INPUT}" || INPUT="$$(top_builddir)/ChangeLog" ; \
@@ -357,6 +392,8 @@ $(top_builddir)/ChangeLog.adoc: $(top_builddir)/ChangeLog
test -n "$@" && { printf '=== Failed to generate the ChangeLog\n\n%s\n\nNOTE: See https://github.com/networkupstools/nut/commits/master for change history.\n\n' "$${MSG}" > "$@" ; } ; \
exit ; \
} ; \
W=10 ; while [ "$${W}" -gt 0 ] && find '$@.tmp.'* '$@' -newer "$${INPUT}" 2>/dev/null >/dev/null ; do sleep 1 ; W="`expr $$W - 1`"; done ; touch "$@.tmp.$$$$"; \
if [ x"`find '$@' -newer "$${INPUT}" 2>/dev/null`" != x ] ; then echo " DOC-CHANGELOG-ASCIIDOC $${INPUT} => $@ : SKIP (keep existing)"; rm -f "$@.tmp.$$$$"; exit 0 ; fi ; \
echo " DOC-CHANGELOG-ASCIIDOC $${INPUT} => $@" \
&& printf "ifdef::txt[]\n== Very detailed Change Log\n"$(A2X_ASCII_IDS)"endif::txt[]\n\n" > "$@.tmp.$$$$" \
&& TABCHAR="`printf '\t'`" \
@@ -370,7 +407,8 @@ $(top_builddir)/ChangeLog.adoc: $(top_builddir)/ChangeLog
-e 's,\[\[\([^]]*\)\]\],[\1],g' \
-e 's,^\(\s\s*\)\([0-9]\),\1{empty}\2,g' \
< "$${INPUT}" >> "$@.tmp.$$$$" \
&& mv -f "$@.tmp.$$$$" "$@"
&& if [ x"`find '$@' -newer "$${INPUT}" 2>/dev/null`" != x ] ; then echo " DOC-CHANGELOG-ASCIIDOC $${INPUT} => $@ : SKIP (keep recently born competitor)"; rm -f "$@.tmp.$$$$"; \
else mv -f "$@.tmp.$$$$" "$@" ; fi

# Add other directory deps (not for local EXTRA_DIST) and generated contents
FULL_USER_MANUAL_DEPS = $(USER_MANUAL_DEPS) $(SHARED_DEPS) \
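The anti-race approach described in the comments and recipe above boils down to the following standalone sketch (illustrative only: `generate_changelog_adoc` is a placeholder for the real header-plus-`sed` conversion, and the real recipe uses make-level `$@`/`$?` variables rather than plain shell ones):

    #!/bin/sh
    # Illustrative sketch of the anti-race pattern for regenerating OUT from IN.
    IN="ChangeLog"
    OUT="ChangeLog.adoc"

    # Give a competing parallel job a few seconds to finish, while its
    # PID-suffixed temp file or an already refreshed OUT is newer than IN.
    W=10
    while [ "$W" -gt 0 ] && [ -n "$(find $OUT.tmp.* $OUT -newer $IN 2>/dev/null)" ] ; do
        sleep 1
        W=$((W - 1))
    done

    # Announce our own attempt to potential competitors...
    touch "$OUT.tmp.$$"

    # ...but if OUT already ended up newer than IN, another job won: keep its result.
    if [ -n "$(find $OUT -newer $IN 2>/dev/null)" ] ; then
        echo "SKIP: $OUT was refreshed by a parallel job"
        rm -f "$OUT.tmp.$$"
        exit 0
    fi

    # Otherwise spend the few seconds ourselves, into the PID-suffixed temp file...
    generate_changelog_adoc < "$IN" > "$OUT.tmp.$$"

    # ...and re-check before moving into place, in case a competitor finished
    # meanwhile (a needless new timestamp would trigger costly HTML/PDF rebuilds).
    if [ -n "$(find $OUT -newer $IN 2>/dev/null)" ] ; then
        rm -f "$OUT.tmp.$$"
    else
        mv -f "$OUT.tmp.$$" "$OUT"
    fi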
83 changes: 73 additions & 10 deletions tools/gitlog2changelog.py.in
@@ -89,6 +89,31 @@ else:
else:
fout = open(CHANGELOG_FILE, "w")

# By default we collect information from a commit and output it as soon as
# we have enough. Part of it is best-effort grouping of a series of commits
# made by the same author on the same day, if they follow each other.
# The alternative is to expend memory to collect all git log entries into a
# dictionary first (key = date+author, value = list of entries) and only
# print the output at the end of processing. This costs more resources, so
# it is not the default behavior.
requireGroupByDateAuthor = False
try:
tmpEnvVar = os.environ.get("CHANGELOG_REQUIRE_GROUP_BY_DATE_AUTHOR", None)
if str(tmpEnvVar).lower() == "true":
requireGroupByDateAuthor = True
except Exception as ignored:
pass

cachedContent = None
if requireGroupByDateAuthor:
try:
from collections import defaultdict
cachedContent = defaultdict(list)
except Exception as x:
print("Failed to init requireGroupByDateAuthor processing as defaultdict(list), trying simple dict(): " + str(x))
requireGroupByDateAuthor = False
cachedContent = dict()

# Set up the loop variables in order to locate the blocks we want
authorFound = False
dateFound = False
@@ -162,7 +187,14 @@ for line in fin:
continue
# Extract the actual commit message for this commit
elif authorFound and dateFound and messageFound is False:
# Find the commit message if we can
# Find the commit message if we can (including the optional
# details after the title and a blank line)
# FIXME: Detect end of message by /^#/ to allow for longer essays
# in the detailed comments part?
# FIXME: Some such comments include asciidoc-ish markup, notably
# bullet lists - do not concatenate those into one block but do
# actually pass them as sub-lists (indented, and perhaps not
# starting with an asterisk which we use for this document).
if len(line) == fin_chop:
if messageNL:
messageFound = True
@@ -187,21 +219,33 @@ for line in fin:
files = files + ", " + fileList[0].strip()
else:
files = fileList[0].strip()

# All of the parts of the commit have been found - write out the entry
if authorFound and dateFound and messageFound and filesFound:
# First the author line, only outputted if it is the first for that
# author on this day
# author on this day.
# WARNING: In case of commit shuffling due to git rebase, merges of dormant
# branches, etc., we are not guaranteed to have all dates in the list
# nicely ordered. In fact, the same date+author can be repeated if
# there were commits with other metadata in git history between those
# (e.g. many PRs from a few authors merged during one day). While we
# could cache each section by authorLine and only output at the end,
# that can require a lot of memory - so by default we do not.
authorLine = date + " " + author
if len(prevAuthorLine) == 0:
fout.write(authorLine + "\n\n")
elif authorLine == prevAuthorLine:
pass
if requireGroupByDateAuthor:
if authorLine not in cachedContent:
cachedContent[authorLine] = list()
else:
fout.write("\n" + authorLine + "\n\n")
if len(prevAuthorLine) == 0:
fout.write(authorLine + "\n\n")
elif authorLine == prevAuthorLine:
pass
else:
fout.write("\n" + authorLine + "\n\n")

# Assemble the actual commit message line(s) and limit the line length
# to 80 characters.
# Avoid printing same (or equivalen) filename lists twice, if commit
# Avoid printing same (or equivalent) filename lists twice, if commit
# message starts with them.
if message.startswith(files + ":"):
commitLine = "* " + message
@@ -219,8 +263,11 @@ for line in fin:
else:
commitLine = "* " + files + ": " + message

# Write out the commit line
fout.write(wrapper.fill(commitLine) + "\n")
if requireGroupByDateAuthor:
cachedContent[authorLine].append(commitLine)
else:
# Write out the commit line, wrapped for length
fout.write(wrapper.fill(commitLine) + "\n")

# Now reset all the variables ready for a new commit block.
authorFound = False
@@ -232,6 +279,22 @@ for line in fin:
files = ""
prevAuthorLine = authorLine

if requireGroupByDateAuthor:
# We did not print anything earlier, so flush it all out now,
# most recent date first (alphanumerically reverse order)
counter = 0
for authorLine in sorted(cachedContent, reverse=True):
if counter == 0:
fout.write(authorLine + "\n\n")
else:
fout.write("\n" + authorLine + "\n\n")

# Use original list append order
for commitLine in cachedContent[authorLine]:
fout.write(wrapper.fill(commitLine) + "\n")

counter = counter + 1

# Close the input and output lines now that we are finished.
if fin_mode == 3:
p.stdout.close()
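As an aside on the "most recent date first (alphanumerically reverse)" flush order in the new grouping code: assuming the ISO-style `YYYY-MM-DD` prefix that the script's date+author keys start with (which is what makes a reverse alphanumeric sort newest-first), a tiny hypothetical demonstration looks as follows (the author names are placeholders, not taken from the patch):

    printf '%s\n' \
        "2024-07-15 Author A <a@example.com>" \
        "2024-07-16 Author B <b@example.com>" \
        "2024-06-30 Author A <a@example.com>" \
    | sort -r
    # Prints the 2024-07-16 key first, then 2024-07-15, then 2024-06-30:
    # the same newest-first order that sorted(cachedContent, reverse=True)
    # produces in the script.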

