# HPCC-32003: Automated testing of hyperlinks in Documentation files (#12)
# Workflow file for this run.
# Note: this file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review it, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters.
name: Test Hyperlinks

# Triggers:
#   * pull requests targeting "master" or any "candidate-*" branch, except
#     the older candidate series excluded by the "!" patterns below;
#   * manual runs (workflow_dispatch).
on:
  pull_request:
    branches:
      - "master"
      - "candidate-*"
      - "!candidate-9.4.*"
      - "!candidate-9.2.*"
      - "!candidate-9.0.*"
      - "!candidate-8.*"
      - "!candidate-7.*"
      - "!candidate-6.*"
    # Path filter is currently disabled, so every matching pull request
    # triggers the workflow regardless of which files it touches.
    # paths:
    #   - '**.md'
    #   - '**.rst'
    #   - '**.xml'
  workflow_dispatch:
jobs:
  # Single job; the steps that follow run in order on one runner and pass
  # data to each other through plain files in the working directory.
  main:
    runs-on: ubuntu-22.04
    steps:
      # Check the repository out into the workspace.
      - name: Checkout repository
        uses: actions/checkout@v4
# Step: build one list of documentation files per format (xml / md / rst).
      - name: List Docs
        run: |
          # Outputs: xmlFilesList.txt, mdFilesList.txt, rstFilesList.txt
          # (one path per line), read by the "List links from Docs" step.
          #
          # The full-repository scan is currently forced on; the original
          # event-based condition is kept below for when the incremental
          # (changed-files-only) mode is re-enabled.
          # if [[ ${{ github.event_name }} == "workflow_dispatch" ]]; then
          if true; then
            find "$PWD" -name '*.xml' -type f > xmlFilesList.txt
            find "$PWD" -name '*.md' -type f > mdFilesList.txt
            find "$PWD" -name '*.rst' -type f > rstFilesList.txt
          else
            git diff --name-only HEAD^1 HEAD > changed_files.txt
            # Match the real file extension (escaped dot, anchored at end of
            # line) instead of "any character followed by xml/md/rst".
            # "|| true": grep exits 1 when nothing matches, which would abort
            # the step because the script runs with errexit.
            grep -E '\.xml$' changed_files.txt > xmlFilesList.txt || true
            grep -E '\.md$' changed_files.txt > mdFilesList.txt || true
            grep -E '\.rst$' changed_files.txt > rstFilesList.txt || true
          fi
# Step: extract hyperlinks from the listed docs and split them into external / internal.
      - name: List links from Docs
        run: |
          # Inputs : xmlFilesList.txt, mdFilesList.txt, rstFilesList.txt
          # Outputs: links_list.txt      every accepted link
          #          rejected_links.txt  XML url="..." found outside <ulink>
          #          missingFiles.txt    listed files that do not exist
          #          ExternalLinks.txt   accepted links containing http(s)://
          #          InternalLinks.txt   all other accepted links
          # Every record has the shape "<file>:<line-no>:<link>" (the prefix
          # is produced by grep -nH).
          #
          # Files are read line by line with "read -r" so paths and links are
          # never word-split or glob-expanded.

          # Report a listed file that is not on disk and remember it.
          note_missing() {
            printf '%s -\033[31m file missing\033[0m\n' "$1"
            printf '%s\n' "$1" >> missingFiles.txt
          }

          # Bare-URL patterns. Inside a bracket expression a backslash is an
          # ordinary character and "]" is literal only when it comes first,
          # so the exclusion sets are written without escapes.
          # Markdown: stop at ] space ; " ' < > [ , ` ) and backslash.
          md_url_re=$'https?://[^] ;"\'<>[,`)\\]+'
          # reStructuredText: stop at space ; " ' < > , ` ) and backslash.
          rst_url_re=$'https?://[^ ;"\'<>,`)\\]+'

          # Make sure the accumulator exists even if no link is found.
          touch links_list.txt

          # ---- DocBook XML ------------------------------------------------
          # grep emits "<ulink", url="http..." and "</ulink>" tokens in
          # document order; a URL is accepted only while a <ulink> is open.
          while IFS= read -r file; do
            if [[ ! -f "$file" ]]; then
              note_missing "$file"
              continue
            fi
            inside_ulink=0
            while IFS= read -r line; do
              link=${line#*:}   # drop "<file>:"
              link=${link#*:}   # drop "<line-no>:"
              case "$link" in
                '<ulink'*)
                  inside_ulink=1
                  ;;
                '</ulink>'*)
                  inside_ulink=0
                  ;;
                *)
                  if (( inside_ulink )); then
                    printf '%s\n' "$line" >> links_list.txt
                  else
                    printf '%s\n' "$line" >> rejected_links.txt
                  fi
                  ;;
              esac
            done < <(grep -onHE -e '<ulink' -e 'url="http[^\"]+' -e '</ulink>' -- "$file" | sed 's/url="//')
          done < xmlFilesList.txt

          # ---- Markdown ---------------------------------------------------
          # Tokens: "](target" of inline links, ``` fence markers, bare URLs.
          # Each ``` token toggles "inside code block"; links found while
          # inside a code block are ignored.
          while IFS= read -r file; do
            if [[ ! -f "$file" ]]; then
              note_missing "$file"
              continue
            fi
            in_code_fence=0
            while IFS= read -r line; do
              link=${line#*:}
              link=${link#*:}
              if [[ "$link" == '```'* ]]; then
                in_code_fence=$(( 1 - in_code_fence ))
                continue
              fi
              if (( ! in_code_fence )); then
                printf '%s\n' "$line" >> links_list.txt
              fi
            done < <(grep -onHE -e '\]\([^\)]+' -e '```[^`]*' -e "$md_url_re" -- "$file" | sed 's/](//')
          done < mdFilesList.txt

          # ---- reStructuredText -------------------------------------------
          # Tokens: ".. _name: target" definitions (the ".. _name: " prefix is
          # stripped by sed) and bare URLs. The dots are escaped so ordinary
          # text such as "x __init__" is not mistaken for a target.
          # NOTE(review): a target with no URL (".. _label:") is kept as-is
          # and ends up in InternalLinks.txt; confirm that is intended.
          while IFS= read -r file; do
            if [[ ! -f "$file" ]]; then
              note_missing "$file"
              continue
            fi
            grep -onHE -e '\.\. _[^\]+' -e "$rst_url_re" -- "$file" | sed 's/\.\. _[^\:]*: //' >> links_list.txt
          done < rstFilesList.txt

          # ---- Classification ---------------------------------------------
          # Drop local endpoints and templated links (containing "$" or "["),
          # then split on the presence of an http(s) scheme.
          # "|| true": an empty category makes grep exit 1, which must not
          # abort the step (the script runs with errexit).
          candidates=$(mktemp)
          grep -vE '127\.0\.0\.1|localhost|\$|\[' links_list.txt > "$candidates" || true
          grep -E 'https://|http://' "$candidates" > ExternalLinks.txt || true
          grep -vE 'https://|http://' "$candidates" > InternalLinks.txt || true
          rm -f -- "$candidates"
# Step: request every external link (once per distinct URL) and log the dead ones.
      - name: Test External links
        run: |
          # Input  : ExternalLinks.txt ("<file>:<line-no>:<url>" records)
          # Outputs: checkedLinksCache.txt  "<url>~<status>" per checked URL
          #          error-report.log       records whose URL returned 404 or
          #                                 never answered
          touch checkedLinksCache.txt

          # In-memory cache url -> status. An exact-key lookup replaces the
          # former grep/cut on the cache file, which treated the URL as a
          # regex and broke on URLs containing "~".
          declare -A status_by_link=()

          # Print the status code of the last response curl received while
          # following redirects (HEAD request, 60 s limit); prints nothing
          # when no response arrived. Only real status lines ("HTTP/...") are
          # considered, not other headers that merely contain "HTTP".
          fetch_status() {
            curl -LI -m 60 -s "$1" | tr -d '\r' | grep -E '^HTTP/' | tail -1 | cut -d ' ' -f2
          }

          while IFS= read -r line; do
            link=${line#*:}   # drop "<file>:"
            link=${link#*:}   # drop "<line-no>:"
            link=${link%.}    # removing trailing .
            link=${link% }    # removing trailing space

            status_code=${status_by_link["$link"]-}
            if [[ -z "$status_code" ]]; then
              attempts_left=3
              while (( attempts_left > 0 )); do
                status_code=$(fetch_status "$link") || true
                if [[ -n "$status_code" ]]; then
                  status_by_link["$link"]=$status_code
                  printf '%s~%s\n' "$link" "$status_code" >> checkedLinksCache.txt
                  break
                fi
                printf '%s\n' "$line"
                echo "retrying..."
                attempts_left=$(( attempts_left - 1 ))
              done
            fi

            # String comparison: a malformed status must not raise an
            # arithmetic error.
            if [[ "$status_code" == "404" ]]; then
              printf '%s - \033[0;31m404 Error\033[0m\n' "$link"
              printf '%s\n' "$line" >> error-report.log
            elif [[ -z "$status_code" ]]; then
              printf '%s - \033[0;31mNo Response\033[0m\n' "$link"
              printf '%s(No Response)\n' "$line" >> error-report.log
            else
              printf '%s - %s\n' "$link" "$status_code"
            fi
          done < ExternalLinks.txt
# Step: verify in-repository references (same-file anchors and relative/absolute paths).
      - name: Test Internal Links
        run: |
          # Input : InternalLinks.txt ("<file>:<line-no>:<reference>" records)
          # Output: error-report.log gets one record per invalid reference.
          #
          # Records are read whole with "read -r"; this step has its own
          # shell, so relying on word splitting would cut records at spaces.
          while IFS= read -r line; do
            file=${line%%:*}
            rest=${line#*:}
            line_no=${rest%%:*}
            reference=${rest#*:}
            # Keep only the first whitespace-delimited token: drops an
            # optional link title such as: (guide.md "Title").
            reference=${reference#"${reference%%[![:space:]]*}"}
            reference=${reference%%[[:space:]]*}

            if [[ "$reference" == '#'* ]]; then
              # Same-file anchor. Heuristic: take the text of the link that
              # points at this anchor and require a heading "# <text>" in the
              # same file. The text is matched literally (-F), not as a regex.
              link_text=$(grep -oE "\[.*\]\(${reference}\)" -- "$file" | sed 's/\[//' | cut -d ']' -f1) || true
              if ! grep -qF -- "# ${link_text}" "$file"; then
                printf '%s -\033[31m invalid reference\033[0m\n' "$line"
                printf '%s\n' "$line" >> error-report.log
              fi
            else
              # Path reference. A trailing "#fragment" names a section inside
              # the target file, so only the path part must exist.
              target=${reference%%#*}
              if [[ "$target" == /* ]]; then
                base_dir=$PWD                      # absolute: repository root
              else
                base_dir=$(dirname -- "$file")     # relative: linking file's dir
              fi
              # -m: canonicalise even when components are missing; plain
              # realpath fails in that case and would abort the whole step.
              resolved=$(realpath -m -- "$base_dir/$target")
              if [[ ! -f "$resolved" ]]; then
                printf '%s -\033[31m invalid reference\033[0m\n' "$line"
                # Log the resolved path in place of the reference as written.
                printf '%s:%s:%s\n' "$file" "$line_no" "$resolved" >> error-report.log
              fi
            fi
          done < InternalLinks.txt
# Step: print a summary and fail the job when any broken reference was logged.
      - name: Report Error links
        run: |
          # error-report.log holds one "<file>:<line-no>:<link>" record per
          # broken reference (404, no response, or invalid internal link).
          # It is absent or empty when every check passed.
          scanned_files=$(cat *FilesList.txt | wc -l)
          printf '\033[32mNo. of files scanned : %s\033[0m\n' "$scanned_files"

          if [[ -s error-report.log ]]; then
            unique_broken=$(cut -d: -f3- error-report.log | sort -u | wc -l)
            total_broken=$(cut -d: -f3- error-report.log | wc -l)
            printf '\033[31mNo. of unique broken links : %s\033[0m\n' "$unique_broken"
            printf '\033[31mTotal No. of reference to broken links : %s\033[0m\n' "$total_broken"
            # Exit statuses are 0-255; use a plain failure code.
            exit 1
          else
            printf '\033[32mNo Broken-links found\033[0m\n'
          fi
# Step: rewrite error-report.log into a per-file, human-readable report.
      - name: Modify Log file
        if: ${{ failure() || cancelled() }}
        run: |
          # Input : error-report.log ("<file>:<line-no>:<link>" records),
          #         missingFiles.txt (optional)
          # Output: error-report.log rewritten as
          #           Missing Files:   (only when there are any)
          #           Broken links:
          #           1. <file>
          #           <TAB> Line <line-no> : <link>
          report=error-report.log
          grouped=error-reportTmp.log

          # This step also runs after cancellations and unrelated failures,
          # when no report may exist yet.
          touch "$report"
          : > "$grouped"

          # Shorten paths: drop everything up to and including the parent of
          # the working directory. Done with a literal (quoted) substitution
          # so characters in the path are never treated as a regex.
          parent_prefix="${PWD%/*}/"
          shortened=$(mktemp)
          while IFS= read -r entry; do
            printf '%s\n' "${entry//"$parent_prefix"/}"
          done < "$report" > "$shortened"
          mv -- "$shortened" "$report"

          # Group records by file. Records are read whole, so a link such as
          # "...(No Response)" stays on one report line, and the file name is
          # compared exactly rather than as a grep pattern.
          file_count=0
          while IFS= read -r doc; do
            file_count=$(( file_count + 1 ))
            printf '%s. %s\n' "$file_count" "$doc"
            while IFS= read -r entry; do
              [[ "${entry%%:*}" == "$doc" ]] || continue
              detail=${entry#*:}                       # "<line-no>:<link>"
              printf '\t Line %s\n' "${detail/:/ : }"  # first ":" -> " : "
            done < "$report"
          done < <(cut -d ':' -f1 "$report" | sort -u) > "$grouped"

          # Assemble the final report.
          {
            if [[ -s missingFiles.txt ]]; then
              printf 'Missing Files: \n\n'
              cat missingFiles.txt
            fi
            printf 'Broken links: \n\n'
            cat "$grouped"
          } > "$report"
# Step: publish the report and the intermediate lists as a build artifact.
      - name: Upload logs
        uses: actions/upload-artifact@v4
        # Only when an earlier step failed or the run was cancelled.
        if: ${{ failure() || cancelled() }}
        with:
          name: Hyperlinks-testing-log
          # Paths are rooted at the workspace instead of a hard-coded
          # /home/runner/work/<repo>/<repo>, so the step also works when the
          # repository has a different name.
          path: |
            ${{ github.workspace }}/error-report.log
            ${{ github.workspace }}/*FilesList.txt
            ${{ github.workspace }}/checkedLinksCache.txt
            ${{ github.workspace }}/*Links.txt
            ${{ github.workspace }}/links_list.txt
            ${{ github.workspace }}/changed_files.txt
            ${{ github.workspace }}/rejected_links.txt
          # Some of these files exist only in certain runs.
          if-no-files-found: ignore