From 66c07020601ada16dcbc1d2107a741653652ffd2 Mon Sep 17 00:00:00 2001 From: Markus Neteler Date: Tue, 12 Nov 2024 23:47:35 +0100 Subject: [PATCH] cronjobs: inject canonical URLs into older manual pages (SEO) The GRASS GIS manual pages of the different versions have been published for a long time with a difficult to understand concept of being invisible, redirected or shown, which also strongly affects the search engine ranking. SEO: Without indication of "canonical" URLs, different versions wipe each other out in search engines. Canonical tags help consolidate duplicate or similar content by specifying the preferred version of a page, ensuring search engines index and rank the desired URL while avoiding duplicate content issues. This PR changes the cronjob scripts to - inject "grass-stable" as the "canonical" into older manual pages under versioned URL - inject "grass-devel" as the "canonical" into the development manual pages under versioned URL Like this no "duplicate content" from an SEO perspective should occur. Also `robots.txt` is updated to reactivate the manual pages of old GRASS GIS versions (which now contain "grass-stable" as the canonical). 
Fixes https://github.com/OSGeo/grass/issues/4579 --- ...ron_grass_current_stable_build_binaries.sh | 39 +++++++++++++++++-- .../cron_grass_legacy_build_binaries.sh | 15 ++++--- .../cron_grass_old_build_binaries.sh | 14 +++---- .../cron_grass_preview_build_binaries.sh | 36 +++++++++++++++-- utils/cronjobs_osgeo_lxd/robots.txt | 26 ++++++------- 5 files changed, 96 insertions(+), 34 deletions(-) diff --git a/utils/cronjobs_osgeo_lxd/cron_grass_current_stable_build_binaries.sh b/utils/cronjobs_osgeo_lxd/cron_grass_current_stable_build_binaries.sh index 994ead9aa6..805282742b 100755 --- a/utils/cronjobs_osgeo_lxd/cron_grass_current_stable_build_binaries.sh +++ b/utils/cronjobs_osgeo_lxd/cron_grass_current_stable_build_binaries.sh @@ -14,6 +14,8 @@ # - generates the pyGRASS 8 HTML manual # - generates the user 8 HTML manuals # - injects DuckDuckGo search field +# - copies over generated manual pages to grass-stable/manuals/ +# - injects in versioned manual the "canonical" to point to "stable" manual (as seen in the Python manual pages) # Preparations, on server (neteler@grasslxd:$): # - install dependencies: @@ -325,12 +327,42 @@ export VERSION_NUMBER=$DOTVERSION python3 $GRASSBUILDDIR/man/build_keywords.py $TARGETMAIN/grass$GMAJOR$GMINOR/manuals/ $TARGETMAIN/grass$GMAJOR$GMINOR/manuals/addons/ unset ARCH ARCH_DISTDIR GISBASE VERSION_NUMBER +############################################ +# Cloning new manual pages into grass-stable/manuals/ (following the Python manual pages concept) +# - inject canonical URL therein to point to versioned manual page (avoiding "duplicate content" SEO punishment) +# see https://developers.google.com/search/docs/crawling-indexing/consolidate-duplicate-urls + +TARGETHTMLDIRSTABLE=$TARGETMAIN/grass-stable/manuals/ +mkdir -p $TARGETHTMLDIRSTABLE $TARGETHTMLDIRSTABLE/addons +# cleanup from previous run +rm -rf /tmp/addons +\mv $TARGETHTMLDIRSTABLE/addons /tmp +rm -f $TARGETHTMLDIRSTABLE/*.* +(cd $TARGETHTMLDIRSTABLE ; rm -rf barscales 
colortables icons northarrows) +# clone manual pages +cp -rp $TARGETHTMLDIR/* $TARGETHTMLDIRSTABLE/ + +############################################ +# SEO: inject canonical link into versioned manual pages (e.g, grass84/) +# - cd back into folder of versioned HTML manual pages +# - run sed to replace an existing HTML header string in the upper part of the HTML file +# with itself + canonical link of stable version +# --> do this for core manual pages, addons, libpython +(cd $TARGETHTMLDIR/ ; for myfile in `grep -L 'link rel="canonical"' *.html` ; do sed -i -e "s::\n:g" $myfile ; done) +(cd $TARGETHTMLDIR/addons/ ; for myfile in `grep -L 'link rel="canonical"' *.html` ; do sed -i -e "s::\n:g" $myfile ; done) +(cd $TARGETHTMLDIR/libpython/ ; for myfile in `grep -L 'link rel="canonical"' *.html` ; do sed -i -e "s::\n:g" $myfile ; done) + ############################################ # create sitemaps to expand the hugo sitemap +# versioned manual: python3 $HOME/src/grass$GMAJOR-addons/utils/create_manuals_sitemap.py --dir=/var/www/code_and_data/grass$GMAJOR$GMINOR/manuals/ --url=https://grass.osgeo.org/grass$GMAJOR$GMINOR/manuals/ -o python3 $HOME/src/grass$GMAJOR-addons/utils/create_manuals_sitemap.py --dir=/var/www/code_and_data/grass$GMAJOR$GMINOR/manuals/addons/ --url=https://grass.osgeo.org/grass$GMAJOR$GMINOR/manuals/addons/ -o +# stable manual: +python3 $HOME/src/grass$GMAJOR-addons/utils/create_manuals_sitemap.py --dir=/var/www/code_and_data/grass-stable/manuals/ --url=https://grass.osgeo.org/grass-stable/manuals/ -o +python3 $HOME/src/grass$GMAJOR-addons/utils/create_manuals_sitemap.py --dir=/var/www/code_and_data/grass-stable/manuals/addons/ --url=https://grass.osgeo.org/grass-stable/manuals/addons/ -o + ############################################ # cleanup cd $GRASSBUILDDIR @@ -339,9 +371,10 @@ rm -rf lib/html/ lib/latex/ /tmp/addons echo "Finished GRASS $VERSION $ARCH compilation." 
echo "Written to: $TARGETDIR" -echo "Copied HTML ${GVERSION} manual to https://grass.osgeo.org/grass${VERSION}/manuals/" -echo "Copied pygrass progman ${GVERSION} to https://grass.osgeo.org/grass${VERSION}/manuals/libpython/" -echo "Copied Addons ${GVERSION} to https://grass.osgeo.org/grass${VERSION}/manuals/addons/" +echo "Copied HTML ${GVERSION} manual to https://grass.osgeo.org/grass${VERSION}/manuals/ (with canonical in metadata)" +echo "Copied pygrass progman ${GVERSION} to https://grass.osgeo.org/grass${VERSION}/manuals/libpython/ (with canonical in metadata)" +echo "Copied Addons ${GVERSION} to https://grass.osgeo.org/grass${VERSION}/manuals/addons/ (with canonical in metadata)" ## echo "Copied HTML ${GVERSION} progman to https://grass.osgeo.org/programming${GVERSION}" +echo "Copied HTML stable manual to https://grass.osgeo.org/grass-stable/manuals/" exit 0 diff --git a/utils/cronjobs_osgeo_lxd/cron_grass_legacy_build_binaries.sh b/utils/cronjobs_osgeo_lxd/cron_grass_legacy_build_binaries.sh index cd319fa561..3d3ba44586 100755 --- a/utils/cronjobs_osgeo_lxd/cron_grass_legacy_build_binaries.sh +++ b/utils/cronjobs_osgeo_lxd/cron_grass_legacy_build_binaries.sh @@ -15,7 +15,7 @@ # - generates the user 7 HTML manuals # - injects DuckDuckGo search field # - injects "G8.x is the new version" box into core and addon manual pages -# - injects canonical URL +# - injects in versioned manual the "canonical" to point to "stable" manual (as seen in the Python manual pages) # Preparations, on server (neteler@grasslxd:$): # - install dependencies: @@ -320,10 +320,9 @@ echo "Injecting G8.x new current version hint in a red box into MAN pages..." 
# - run sed to replace an existing HTML header string in the upper part of the HTML file # with itself + canonical link of stable version # --> do this for core manual pages, addons, libpython -(cd $TARGETHTMLDIR/ ; for myfile in `grep -L 'link rel="canonical"' *.html` ; do sed -i -e "s::\n:g" $myfile ; done) -(cd $TARGETHTMLDIR/addons/ ; for myfile in `grep -L 'link rel="canonical"' *.html` ; do sed -i -e "s::\n:g" $myfile ; done) -(cd $TARGETHTMLDIR/libpython/ ; for myfile in `grep -L 'link rel="canonical"' *.html` ; do sed -i -e "s::\n:g" $myfile ; done) - +(cd $TARGETHTMLDIR/ ; for myfile in `grep -L 'link rel="canonical"' *.html` ; do sed -i -e "s::\n:g" $myfile ; done) +(cd $TARGETHTMLDIR/addons/ ; for myfile in `grep -L 'link rel="canonical"' *.html` ; do sed -i -e "s::\n:g" $myfile ; done) +(cd $TARGETHTMLDIR/libpython/ ; for myfile in `grep -L 'link rel="canonical"' *.html` ; do sed -i -e "s::\n:g" $myfile ; done) ############################################ # create sitemaps to expand the hugo sitemap @@ -339,8 +338,8 @@ rm -rf lib/html/ lib/latex/ /tmp/addons echo "Finished GRASS $VERSION $ARCH compilation." 
echo "Written to: $TARGETDIR" -echo "Copied HTML ${GVERSION} manual to https://grass.osgeo.org/grass${VERSION}/manuals/" -echo "Copied pygrass progman ${GVERSION} to https://grass.osgeo.org/grass${VERSION}/manuals/libpython/" -echo "Copied Addons ${GVERSION} to https://grass.osgeo.org/grass${VERSION}/manuals/addons/" +echo "Copied HTML ${GVERSION} manual to https://grass.osgeo.org/grass${VERSION}/manuals/ (with canonical in metadata)" +echo "Copied pygrass progman ${GVERSION} to https://grass.osgeo.org/grass${VERSION}/manuals/libpython/ (with canonical in metadata)" +echo "Copied Addons ${GVERSION} to https://grass.osgeo.org/grass${VERSION}/manuals/addons/ (with canonical in metadata)" exit 0 diff --git a/utils/cronjobs_osgeo_lxd/cron_grass_old_build_binaries.sh b/utils/cronjobs_osgeo_lxd/cron_grass_old_build_binaries.sh index c0144a3587..c6cea5b203 100755 --- a/utils/cronjobs_osgeo_lxd/cron_grass_old_build_binaries.sh +++ b/utils/cronjobs_osgeo_lxd/cron_grass_old_build_binaries.sh @@ -15,7 +15,7 @@ # - generates the user 8 HTML manuals # - injects DuckDuckGo search field # - injects "G8.x is the new version" box into core and addon manual pages -# - injects canonical URL +# - injects in versioned manual the "canonical" to point to "stable" manual (as seen in the Python manual pages) # Preparations, on server (neteler@grasslxd:$): # - install dependencies: @@ -352,9 +352,9 @@ echo "Injecting G8.x new current version hint in a red box into MAN pages..." 
# - run sed to replace an existing HTML header string in the upper part of the HTML file # with itself + canonical link of stable version # --> do this for core manual pages, addons, libpython -(cd $TARGETHTMLDIR/ ; for myfile in `grep -L 'link rel="canonical"' *.html` ; do sed -i -e "s::\n:g" $myfile ; done) -(cd $TARGETHTMLDIR/addons/ ; for myfile in `grep -L 'link rel="canonical"' *.html` ; do sed -i -e "s::\n:g" $myfile ; done) -(cd $TARGETHTMLDIR/libpython/ ; for myfile in `grep -L 'link rel="canonical"' *.html` ; do sed -i -e "s::\n:g" $myfile ; done) +(cd $TARGETHTMLDIR/ ; for myfile in `grep -L 'link rel="canonical"' *.html` ; do sed -i -e "s::\n:g" $myfile ; done) +(cd $TARGETHTMLDIR/addons/ ; for myfile in `grep -L 'link rel="canonical"' *.html` ; do sed -i -e "s::\n:g" $myfile ; done) +(cd $TARGETHTMLDIR/libpython/ ; for myfile in `grep -L 'link rel="canonical"' *.html` ; do sed -i -e "s::\n:g" $myfile ; done) ############################################ # create sitemaps to expand the hugo sitemap @@ -370,9 +370,9 @@ rm -rf lib/html/ lib/latex/ /tmp/addons echo "Finished GRASS $VERSION $ARCH compilation." 
echo "Written to: $TARGETDIR" -echo "Copied HTML ${GVERSION} manual to https://grass.osgeo.org/grass${VERSION}/manuals/" -echo "Copied pygrass progman ${GVERSION} to https://grass.osgeo.org/grass${VERSION}/manuals/libpython/" -echo "Copied Addons ${GVERSION} to https://grass.osgeo.org/grass${VERSION}/manuals/addons/" +echo "Copied HTML ${GVERSION} manual to https://grass.osgeo.org/grass${VERSION}/manuals/ (with canonical in metadata)" +echo "Copied pygrass progman ${GVERSION} to https://grass.osgeo.org/grass${VERSION}/manuals/libpython/ (with canonical in metadata)" +echo "Copied Addons ${GVERSION} to https://grass.osgeo.org/grass${VERSION}/manuals/addons/ (with canonical in metadata)" ## echo "Copied HTML ${GVERSION} progman to https://grass.osgeo.org/programming${GVERSION}" exit 0 diff --git a/utils/cronjobs_osgeo_lxd/cron_grass_preview_build_binaries.sh b/utils/cronjobs_osgeo_lxd/cron_grass_preview_build_binaries.sh index 4028902b89..79b89bd65c 100755 --- a/utils/cronjobs_osgeo_lxd/cron_grass_preview_build_binaries.sh +++ b/utils/cronjobs_osgeo_lxd/cron_grass_preview_build_binaries.sh @@ -15,6 +15,8 @@ # - generates the pyGRASS 8 HTML manual # - generates the user 8 HTML manuals # - injects DuckDuckGo search field +# - copies over generated manual pages to grass-devel/manuals/ +# - injects in versioned manual the "canonical" to point to "devel" manual (as seen in the Python manual pages) # Preparations, on server (neteler@grasslxd:$): @@ -327,6 +329,30 @@ export GISBASE=$ARCH_DISTDIR export VERSION_NUMBER=$DOTVERSION python3 $GRASSBUILDDIR/man/build_keywords.py $TARGETMAIN/grass$GMAJOR$GMINOR/manuals/ $TARGETMAIN/grass$GMAJOR$GMINOR/manuals/addons/ unset ARCH ARCH_DISTDIR GISBASE VERSION_NUMBER +############################################ +# Cloning new manual pages into grass-devel/manuals/ (following the Python manual pages concept) +# - inject canonical URL therein to point to versioned manual page (avoiding "duplicate content" SEO punishment) +# see 
https://developers.google.com/search/docs/crawling-indexing/consolidate-duplicate-urls + +TARGETHTMLDIRDEVEL=$TARGETMAIN/grass-devel/manuals/ +mkdir -p $TARGETHTMLDIRDEVEL $TARGETHTMLDIRDEVEL/addons +# cleanup from previous run +rm -rf /tmp/addons +\mv $TARGETHTMLDIRDEVEL/addons /tmp +rm -f $TARGETHTMLDIRDEVEL/*.* +(cd $TARGETHTMLDIRDEVEL ; rm -rf barscales colortables icons northarrows) +# clone manual pages +cp -rp $TARGETHTMLDIR/* $TARGETHTMLDIRDEVEL/ + +############################################ +# SEO: inject canonical link into versioned manual pages (e.g, grass85/) +# - cd back into folder of versioned HTML manual pages +# - run sed to replace an existing HTML header string in the upper part of the HTML file +# with itself + canonical link of devel version +# --> do this for core manual pages, addons, libpython +(cd $TARGETHTMLDIR/ ; for myfile in `grep -L 'link rel="canonical"' *.html` ; do sed -i -e "s::\n:g" $myfile ; done) +(cd $TARGETHTMLDIR/addons/ ; for myfile in `grep -L 'link rel="canonical"' *.html` ; do sed -i -e "s::\n:g" $myfile ; done) +(cd $TARGETHTMLDIR/libpython/ ; for myfile in `grep -L 'link rel="canonical"' *.html` ; do sed -i -e "s::\n:g" $myfile ; done) ############################################ # create sitemaps to expand the hugo sitemap @@ -334,6 +360,9 @@ unset ARCH ARCH_DISTDIR GISBASE VERSION_NUMBER python3 $HOME/src/grass$GMAJOR-addons/utils/create_manuals_sitemap.py --dir=/var/www/code_and_data/grass$GMAJOR$GMINOR/manuals/ --url=https://grass.osgeo.org/grass$GMAJOR$GMINOR/manuals/ -o python3 $HOME/src/grass$GMAJOR-addons/utils/create_manuals_sitemap.py --dir=/var/www/code_and_data/grass$GMAJOR$GMINOR/manuals/addons/ --url=https://grass.osgeo.org/grass$GMAJOR$GMINOR/manuals/addons/ -o +python3 $HOME/src/grass$GMAJOR-addons/utils/create_manuals_sitemap.py --dir=/var/www/code_and_data/grass-devel/manuals/ --url=https://grass.osgeo.org/grass-devel/manuals/ -o +python3 $HOME/src/grass$GMAJOR-addons/utils/create_manuals_sitemap.py 
--dir=/var/www/code_and_data/grass-devel/manuals/addons/ --url=https://grass.osgeo.org/grass-devel/manuals/addons/ -o + ############################################ # cleanup cd $GRASSBUILDDIR @@ -342,9 +371,10 @@ rm -rf lib/html/ lib/latex/ /tmp/addons echo "Finished GRASS $VERSION $ARCH compilation." echo "Written to: $TARGETDIR" -echo "Copied HTML ${GVERSION} manual to https://grass.osgeo.org/grass${VERSION}/manuals/" -echo "Copied pygrass progman ${GVERSION} to https://grass.osgeo.org/grass${VERSION}/manuals/libpython/" -echo "Copied Addons ${GVERSION} to https://grass.osgeo.org/grass${VERSION}/manuals/addons/" +echo "Copied HTML ${GVERSION} manual to https://grass.osgeo.org/grass${VERSION}/manuals/ (with canonical in metadata)" +echo "Copied pygrass progman ${GVERSION} to https://grass.osgeo.org/grass${VERSION}/manuals/libpython/ (with canonical in metadata)" +echo "Copied Addons ${GVERSION} to https://grass.osgeo.org/grass${VERSION}/manuals/addons/ (with canonical in metadata)" echo "Copied HTML ${GVERSION} progman to https://grass.osgeo.org/programming${GVERSION}" +echo "Copied HTML devel manual to https://grass.osgeo.org/grass-devel/manuals/" exit 0 diff --git a/utils/cronjobs_osgeo_lxd/robots.txt b/utils/cronjobs_osgeo_lxd/robots.txt index 94447a18e1..804aed9b1c 100644 --- a/utils/cronjobs_osgeo_lxd/robots.txt +++ b/utils/cronjobs_osgeo_lxd/robots.txt @@ -11,30 +11,30 @@ Disallow: /stats/ Disallow: /gdp/grassmanuals/ Disallow: /gdp/html_grass4/ Disallow: /gdp/html_grass5/ + Disallow: /grass51/manuals/ Disallow: /grass5/manuals/html53_user/ -# SEO: we inject canonical link in all (old) manual pages to point to latest stable (avoid duplicate content SEO punishment) -# -> allow crawling of even GRASS GIS versions -# see cron_grass7_relbranch_build_binaries.sh -# (older versions have been manually tweaked) +Disallow: /grass54/manuals/ Disallow: /grass57/ + +# SEO note: we have injected canonical link in all (old) manual pages to point +# to grass-stable (this 
avoids "duplicate content" SEO punishment) +# Only odd, undesired versions are disallowed here: +Disallow: /grass60/ Disallow: /grass61/ +Disallow: /grass62/ Disallow: /grass63/ -Disallow: /grass65/ + Disallow: /grass71/ + Disallow: /grass73/ Disallow: /grass75/ +Disallow: /grass76/ Disallow: /grass77/ + Disallow: /grass79/ -Disallow: /grass81/ +Disallow: /grass81/ Sitemap: https://grass.osgeo.org/sitemap.xml Sitemap: https://grass.osgeo.org/sitemap_hugo.xml -## bring SEO back and use numbers -#Sitemap: https://grass.osgeo.org/grass-stable/manuals/sitemap_manuals.xml -Sitemap: https://grass.osgeo.org/grass82/manuals/sitemap_manuals.xml -#Sitemap: https://grass.osgeo.org/grass-stable/manuals/addons/sitemap_manuals.xml -Sitemap: https://grass.osgeo.org/grass82/manuals/addons/sitemap_manuals.xml -#Sitemap: https://grass.osgeo.org/grass-devel/manuals/sitemap_manuals.xml -Sitemap: https://grass.osgeo.org/grass83/manuals/sitemap_manuals.xml