Skip to content

HPCC-32003 Automated testing of hyperlinks in Documentation files #10

HPCC-32003 Automated testing of hyperlinks in Documentation files

HPCC-32003 Automated testing of hyperlinks in Documentation files #10

Workflow file for this run

# Workflow that tests the hyperlinks found in documentation files.
name: Test Hyperlinks
on:
  # Pull requests aimed at master or at a candidate branch. The patterns
  # starting with "!" remove the older candidate series from that set.
  pull_request:
    branches:
      - 'master'
      - 'candidate-*'
      - '!candidate-9.4.*'
      - '!candidate-9.2.*'
      - '!candidate-9.0.*'
      - '!candidate-8.*'
      - '!candidate-7.*'
      - '!candidate-6.*'
    # Path filter, currently disabled:
    # paths:
    #   - '**.md'
    #   - '**.rst'
    #   - '**.xml'
  # Manual trigger with one optional boolean input.
  workflow_dispatch:
    inputs:
      full-scan:
        description: "Scan all files"
        required: false
        type: boolean
        default: false
jobs:
  # The only job of this workflow; all steps run on one Ubuntu 22.04 runner.
  main:
    runs-on: ubuntu-22.04
    steps:
      # Check the repository out into the runner's workspace.
      - name: Checkout repository
        uses: actions/checkout@v4
- name: List Docs
  env:
    # "true" only for a manual run with the full-scan input enabled; empty
    # for pull_request events, which carry no inputs.
    FULL_SCAN: ${{ github.event.inputs.full-scan }}
  run: |
    # Write the documentation files to check into xmlFilesList.txt,
    # mdFilesList.txt and rstFilesList.txt, one absolute path per line.
    #
    # All files are listed when a full scan was requested, or when the
    # parent commit is missing from the clone (e.g. a depth-1 checkout),
    # because "git diff HEAD^1 HEAD" cannot work without it. Otherwise only
    # the files changed by HEAD are listed.
    if [[ "$FULL_SCAN" == "true" ]] || ! git rev-parse --quiet --verify 'HEAD^1' > /dev/null 2>&1; then
      find "$PWD" -name '*.xml' -type f > xmlFilesList.txt
      find "$PWD" -name '*.md' -type f > mdFilesList.txt
      find "$PWD" -name '*.rst' -type f > rstFilesList.txt
    else
      # --diff-filter=d drops deleted files: they no longer exist on disk
      # and could not be read by the following steps.
      git diff --name-only --diff-filter=d HEAD^1 HEAD > changed_files.txt
      # Start from empty lists so each file exists even without a match.
      : > xmlFilesList.txt
      : > mdFilesList.txt
      : > rstFilesList.txt
      # Sort the changed paths by extension. A shell pattern is used rather
      # than grep so that "no match" cannot abort the step, and paths are
      # made absolute to match the output of the find branch.
      while IFS= read -r changed; do
        case "$changed" in
          *.xml) printf '%s/%s\n' "$PWD" "$changed" >> xmlFilesList.txt ;;
          *.md) printf '%s/%s\n' "$PWD" "$changed" >> mdFilesList.txt ;;
          *.rst) printf '%s/%s\n' "$PWD" "$changed" >> rstFilesList.txt ;;
        esac
      done < changed_files.txt
    fi
- name: List links from Docs
  run: |
    # Extract the hyperlinks of every listed XML, Markdown and
    # reStructuredText file as "file:line:link" records in links_list.txt,
    # then split them into ExternalLinks.txt (http/https) and
    # InternalLinks.txt (everything else).
    #
    # Lists are read with "while read" and every expansion is quoted so that
    # characters such as "?" or "*" inside a URL are never glob-expanded.

    # Later commands read this file; make sure it exists when nothing is found.
    touch links_list.txt

    # XML: a url="http..." value is kept only while a <ulink> element is
    # open; values seen outside of one go to rejected_links.txt.
    while IFS= read -r file; do
      [[ -n "$file" ]] || continue
      grep -onHE -e "<ulink" -e 'url="http[^\"\]+' -e "</ulink>" "${file}" | sed 's/url="//' > links.tmp
      flag=0
      while IFS= read -r line; do
        link=$(cut -d ':' -f3- <<< "$line")
        if [[ ${link:0:6} == '<ulink' ]]; then
          flag=1
          continue
        elif [[ ${link:0:8} == '</ulink>' ]]; then
          flag=0
          continue
        fi
        if [[ $flag -eq 1 ]]; then
          printf '%s\n' "$line" >> links_list.txt
        else
          printf '%s\n' "$line" >> rejected_links.txt
        fi
      done < links.tmp
    done < xmlFilesList.txt

    # Markdown: each ``` marker toggles "inside a code fence"; links found
    # while inside a fence are skipped.
    while IFS= read -r file; do
      [[ -n "$file" ]] || continue
      grep -onHE -e "\]\([^\)]+" -e "\`\`\`[^\`]*" -e "http://[^\ \;\"\'\<\>\]\[\,\`\)]+" -e "https://[^\ \;\"\'\<\>\]\[\,\`\)]+" "${file}" | sed 's/](//' > links.tmp
      flag=0
      while IFS= read -r line; do
        link=$(cut -d ':' -f3- <<< "$line")
        if [[ ${link:0:3} == '```' ]]; then
          flag=$(( 1 - flag ))
          continue
        fi
        if [[ $flag -eq 0 ]]; then
          printf '%s\n' "$line" >> links_list.txt
        fi
      done < links.tmp
    done < mdFilesList.txt

    # reStructuredText: the sed drops the ".. _label: " prefix of a target
    # definition so that only its destination remains.
    while IFS= read -r file; do
      [[ -n "$file" ]] || continue
      grep -onHE -e ".. _[^\]+" -e "http://[^\ \;\"\'\<\>\,\`\)]+" -e "https://[^\ \;\"\'\<\>\,\`\)]+" "${file}" | sed 's/.. _[^\:]*: //' >> links_list.txt
    done < rstFilesList.txt

    # Drop records mentioning a local address, "$" or "[", then split by
    # scheme. grep exits 1 when it selects nothing; "|| true" keeps an empty
    # result from aborting the step, since run scripts stop on the first
    # failing command.
    grep -vE '127.0.0.1|localhost|\$|\[' links_list.txt | grep -E 'https://|http://' > ExternalLinks.txt || true
    grep -vE '127.0.0.1|localhost|\$|\[' links_list.txt | grep -vE 'https://|http://' > InternalLinks.txt || true
- name: Test External links
  run: |
    # Request the headers of every link in ExternalLinks.txt and append the
    # records whose final status is 404 to error-report.log. Every obtained
    # status is also appended to checkedLinksCache.txt as "link~status".
    touch checkedLinksCache.txt

    # Statuses already obtained in this run, keyed by the exact link text.
    # An exact-key lookup avoids treating the link as a regular expression
    # or matching it as a substring of another cached link.
    declare -A checked=()

    while IFS= read -r line; do
      [[ -n "$line" ]] || continue
      link=$(cut -d ':' -f3- <<< "$line")
      link=${link%.} #removing trailing .
      link=${link% } #removing trailing space
      [[ -n "$link" ]] || continue

      if [[ -n "${checked[$link]+cached}" ]]; then
        status_code=${checked[$link]}
      else
        status_code=""
        try=3
        while [[ $try -ne 0 ]]; do
          # -L follows redirects, so several status lines may be printed;
          # awk keeps the code of the last one. Carriage returns are removed
          # first so the code is a clean number.
          status_code=$(curl -LI -m 60 -s "$link" | tr -d '\r' | awk '/^HTTP\// { code = $2 } END { print code }')
          if [[ -n "$status_code" ]]; then
            checked[$link]=$status_code
            echo "$link~$status_code" >> checkedLinksCache.txt
            break
          fi
          echo "$line"
          echo "retrying..."
          try=$(( try - 1 ))
        done
      fi

      # Only 404 counts as broken; a link that never answered is printed
      # with an empty status and is not reported.
      if [[ "$status_code" == "404" ]]; then
        echo -e "${link} - \033[0;31m404 Error\033[0m"
        echo "${line}" >> error-report.log
      else
        echo "${link} - ${status_code}"
      fi
    done < ExternalLinks.txt
- name: Test Internal Links
  run: |
    # Check every "file:line:reference" record of InternalLinks.txt and
    # append the records that do not resolve to error-report.log.
    #   "#anchor" references : the file must contain a heading carrying the
    #                          text of the link that points at the anchor.
    #   other references     : the referenced file must exist, relative to
    #                          the workspace ("/...") or to the linking file.
    #
    # Records are read line by line so that spaces inside a path or a link
    # do not split one record into several.
    scheme_re='^[A-Za-z][A-Za-z0-9+.-]*:'
    while IFS= read -r line; do
      [[ -n "$line" ]] || continue
      file=$(cut -d ':' -f1 <<< "$line")
      reference=$(cut -d ':' -f3- <<< "$line")

      if [[ ${reference:0:1} == '#' ]]; then
        # "[^]]*" keeps the match inside one pair of brackets, so another
        # link earlier on the same line cannot supply the text.
        Link_text=$(grep -oE "\[[^]]*\]\(${reference}\)" "$file" | head -n 1 | sed 's/\[//' | cut -d ']' -f1)
        # -F: the link text is matched literally, not as a pattern.
        isPresent=$(grep -F -- "# ${Link_text}" "$file" | wc -l)
        if [[ $isPresent -eq 0 ]]; then
          echo "${line}" >> error-report.log
        fi
        continue
      fi

      # References with a URI scheme (mailto:, ftp:, ...) are not files.
      if [[ $reference =~ $scheme_re ]]; then
        continue
      fi

      # Keep only the path: drop a "#fragment" and an optional "title".
      target=${reference%%#*}
      target=${target%% \"*}

      if [[ ${target:0:1} == '/' ]]; then
        baseDir=$PWD
      else
        baseDir=$(dirname "$file")
      fi
      # -m resolves the path even when parts of it do not exist. Without it
      # realpath fails on a missing directory and, since run scripts stop on
      # the first failing command, the step would end before logging the link.
      searchFile=$(realpath -m "$baseDir/${target}")
      if [[ ! -f "$searchFile" ]]; then
        echo "${line}" >> error-report.log
      fi
    done < InternalLinks.txt
- name: report Error links
  run: |
    # Print the scan summary and fail the job when error-report.log holds at
    # least one broken link. The log only exists when a link was reported,
    # so its absence means zero broken links.
    if [[ -f error-report.log ]]; then
      Number_of_404_links=$(wc -l < error-report.log)
    else
      Number_of_404_links=0
    fi
    echo -e "\u001b[32mNo. of files scanned : $( cat *FilesList.txt | wc -l )"
    if [[ $Number_of_404_links -ne 0 ]]; then
      # Field 3 onwards of a "file:line:link" record is the link itself.
      echo -e "\u001b[31mNo. of unique Broken links : $( cut -d: -f3- error-report.log | sort -u | wc -l )"
      echo -e "\u001b[31mTotal No. of Broken links : $( cut -d: -f3- error-report.log | wc -l )"
      # Any non-zero status fails the step; 1 is within the valid 0-255 range.
      exit 1
    else
      echo -e "\u001b[32mNo Broken-links found"
    fi
- name: Modify Log file
  if: ${{ failure() || cancelled() }}
  run: |
    # Rewrite error-report.log from flat "file:line:link" records into a
    # report grouped by file:
    #   1. <file>
    #   <tab> Line <line> : <link>
    # This step also runs when an earlier step failed before any link was
    # reported; in that case there is nothing to format.
    if [[ ! -s error-report.log ]]; then
      echo "error-report.log is missing or empty; nothing to format"
      exit 0
    fi

    # Parent directory of the workspace, with a trailing slash. Removing it
    # leaves paths that start with the workspace directory name.
    prefix="${PWD%/*}/"

    # 1) strip the prefix, 2) group records by file (the stable sort keeps
    # the original order inside each file), 3) print a numbered header
    # whenever the file changes. Records are handled whole, so spaces or
    # backslashes in a link are preserved.
    while IFS= read -r entry; do
      printf '%s\n' "${entry#"$prefix"}"
    done < error-report.log \
      | sort -s -t ':' -k1,1 \
      | {
          fileCount=0
          previous=""
          while IFS= read -r entry; do
            file=${entry%%:*}
            if [[ "$file" != "$previous" ]]; then
              fileCount=$(( fileCount + 1 ))
              echo "$fileCount. $file"
              previous=$file
            fi
            details=${entry#*:}
            # Only the first ":" (after the line number) gets padded.
            printf '\t Line %s\n' "${details/:/ : }"
          done
        } > error-reportTmp.log
    cat error-reportTmp.log > error-report.log
- name: Upload logs
  uses: actions/upload-artifact@v4
  if: ${{ failure() || cancelled() }}
  with:
    name: Hyperlinks-testing-log
    # Paths are relative to the workspace, where the previous steps wrote
    # their files, so the upload does not depend on the repository name or
    # on the runner's directory layout.
    path: |
      error-report.log
      *FilesList.txt
      checkedLinksCache.txt
      *Links.txt
      links_list.txt
      changed_files.txt
      rejected_links.txt
    # Several of these files are only created in some runs.
    if-no-files-found: ignore