HPCC-32003 Automated testing of hyperlinks in Documentation files #6
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Test Hyperlinks

on:
  # Pull requests targeting master or a candidate branch trigger the check.
  # Patterns starting with "!" exclude the listed older candidate lines.
  pull_request:
    branches:
      - 'master'
      - 'candidate-*'
      - '!candidate-9.4.*'
      - '!candidate-9.2.*'
      - '!candidate-9.0.*'
      - '!candidate-8.*'
      - '!candidate-7.*'
      - '!candidate-6.*'
    # paths:
    #   - '**.md'
    #   - '**.rst'
    #   - '**.xml'

  # Manual runs expose a boolean "full-scan" input, off by default.
  workflow_dispatch:
    inputs:
      full-scan:
        description: 'Scan all files'
        required: false
        type: boolean
        default: false
jobs:
  main:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          # The default checkout is a depth-1 shallow clone, in which HEAD^1
          # cannot be resolved. Fetch one extra commit so that steps diffing
          # HEAD^1 against HEAD have a parent commit to compare with.
          fetch-depth: 2
- name: List Docs
  env:
    # "true" only for manual runs started with the full-scan box ticked;
    # empty for pull_request events.
    FULL_SCAN: ${{ inputs.full-scan }}
  run: |
    # Produce xmlFilesList.txt, mdFilesList.txt and rstFilesList.txt, one
    # absolute path per line, for the later link-extraction step.
    #
    # A full scan is done when it was requested explicitly, or when the
    # parent commit is not available locally (e.g. a shallow clone), since
    # a diff against HEAD^1 is impossible in that case.
    if [[ "$FULL_SCAN" == "true" ]] || ! git rev-parse --verify --quiet 'HEAD^1' > /dev/null; then
      find "$PWD" -name '*.xml' -type f > xmlFilesList.txt
      find "$PWD" -name '*.md' -type f > mdFilesList.txt
      find "$PWD" -name '*.rst' -type f > rstFilesList.txt
    else
      # --diff-filter=d drops deleted files, which can no longer be read.
      git diff --name-only --diff-filter=d HEAD^1 HEAD > changed_files.txt
      # Match on the real file extension and prefix the workspace path so
      # both branches emit absolute paths. awk exits 0 even when nothing
      # matches, so an empty list does not abort the step under "bash -e".
      awk -v root="$PWD" '/\.xml$/ { print root "/" $0 }' changed_files.txt > xmlFilesList.txt
      awk -v root="$PWD" '/\.md$/  { print root "/" $0 }' changed_files.txt > mdFilesList.txt
      awk -v root="$PWD" '/\.rst$/ { print root "/" $0 }' changed_files.txt > rstFilesList.txt
    fi
- name: List links from Docs
  run: |
    # Collect every candidate link as "file:line:link" in links_list.txt,
    # then split the list into ExternalLinks.txt (http/https) and
    # InternalLinks.txt (everything else).

    # Guarantee the aggregate file exists even if no document has a link.
    touch links_list.txt

    # --- XML: keep only url="http..." attributes that sit between an
    # opening <ulink and its closing </ulink>; others go to
    # rejected_links.txt.
    while IFS= read -r file
    do
      grep -onHE -e "<ulink" -e 'url="http[^\"\]+' -e "</ulink>" "${file}" | sed 's/url="//' > links.tmp
      flag=0
      while IFS= read -r line
      do
        # Strip the leading "file:line:" that grep -nH adds.
        link=${line#*:}
        link=${link#*:}
        if [[ ${link:0:6} == '<ulink' ]]; then
          flag=1
          continue
        elif [[ ${link:0:8} == '</ulink>' ]]; then
          flag=0
          continue
        fi
        if [[ $flag -eq 1 ]]; then
          printf '%s\n' "$line" >> links_list.txt
        else
          printf '%s\n' "$line" >> rejected_links.txt
        fi
      done < links.tmp
    done < xmlFilesList.txt

    # --- Markdown: take "](target" link targets and bare http(s) URLs,
    # ignoring anything between a pair of ``` code fences.
    while IFS= read -r file
    do
      grep -onHE -e "\]\([^\)]+" -e "\`\`\`[^\`]*" -e "http://[^\ \;\"\'\<\>\,\`\)]+" -e "https://[^\ \;\"\'\<\>\,\`\)]+" "${file}" | sed 's/](//' > links.tmp
      flag=0
      while IFS= read -r line
      do
        link=${line#*:}
        link=${link#*:}
        if [[ ${link:0:3} == '```' ]]; then
          # Each fence toggles between "inside" and "outside" a code block.
          flag=$(( 1 - flag ))
          continue
        fi
        if [[ $flag -eq 0 ]]; then
          printf '%s\n' "$line" >> links_list.txt
        fi
      done < links.tmp
    done < mdFilesList.txt

    # --- reStructuredText: take ".. _name: target" definitions and bare
    # http(s) URLs; sed removes the ".. _name: " prefix.
    while IFS= read -r file
    do
      grep -onHE -e ".. _[^\]+" -e "http://[^\ \;\"\'\<\>\,\`\)]+" -e "https://[^\ \;\"\'\<\>\,\`\)]+" "${file}" | sed 's/.. _[^\:]*: //' >> links_list.txt
    done < rstFilesList.txt

    # Drop loopback URLs and templated/placeholder links, then split by
    # scheme. grep exits 1 when it selects nothing; "|| true" keeps an
    # empty result from failing the step under "bash -e".
    grep -vE '127.0.0.1|localhost|\$|\[' links_list.txt | grep -E 'https://|http://' > ExternalLinks.txt || true
    grep -vE '127.0.0.1|localhost|\$|\[' links_list.txt | grep -vE 'https://|http://' > InternalLinks.txt || true
- name: Test External links
  run: |
    # Request the headers of every external link (following redirects) and
    # record each "file:line:link" whose final status is 404 in
    # error-report.log.
    #
    # checkedLinksCache.txt keeps a "link~status" line per resolved URL.
    # Lookups use an associative array so a URL is matched exactly rather
    # than as a regex or substring of another cached URL.
    touch checkedLinksCache.txt
    declare -A checkedStatus

    while IFS= read -r line
    do
      # Strip the leading "file:line:" prefix.
      link=${line#*:}
      link=${link#*:}
      link=${link%.} #removing trailing .
      link=${link% } #removing trailing space
      # An empty key is not a valid associative-array subscript.
      if [[ -z $link ]]; then
        continue
      fi

      if [[ -n ${checkedStatus[$link]+cached} ]]; then
        status_code=${checkedStatus[$link]}
      else
        status_code=''
        try=3
        while [[ $try -ne 0 ]]
        do
          # Only real status lines start with "HTTP/"; with -L there is one
          # per hop, and the last one belongs to the final response.
          status_code=$(curl -LI -m 60 -s "$link" | grep -E '^HTTP/' | tail -1 | cut -d' ' -f2 )
          if [[ -n $status_code ]]; then
            checkedStatus[$link]=$status_code
            echo "$link~$status_code" >> checkedLinksCache.txt
            break
          else
            echo "$line"
            echo "retrying..."
            try=$(( try - 1))
          fi
        done
      fi

      # String comparison: status_code may be empty when every try failed.
      if [[ $status_code == 404 ]]; then
        echo -e "${link} - \033[0;31m404 Error\033[0m"
        echo "${line}" >> error-report.log
      else
        echo "${link} - ${status_code}"
      fi
    done < ExternalLinks.txt
- name: Test Internal Links
  run: |
    # Check every non-http reference ("file:line:reference") and append the
    # entries that do not resolve to error-report.log.
    #
    # Lines are read whole with "read -r": this step runs in its own shell
    # with the default IFS, so word-splitting a $(cat ...) would break any
    # entry that contains a space.
    while IFS= read -r line
    do
      file=${line%%:*}
      reference=${line#*:}
      reference=${reference#*:}

      if [[ ${reference:0:1} == '#' ]]; then
        # Same-document anchor: look up the text of the [text](#anchor) link
        # and require a heading "# text" in the same file.
        # NOTE(review): this assumes the heading text equals the link text;
        # it does not derive the anchor slug from the heading - confirm this
        # heuristic is acceptable.
        Link_text=$( grep -oE "\[.*\]\(${reference}\)" "$file" | sed 's/\[//' | cut -d ']' -f1 )
        isPresent=$( grep -oE "# ${Link_text}" "$file" | wc -w )
        if [[ $isPresent -eq 0 ]]; then
          echo "${line}" >> error-report.log
        fi
      else
        # File reference: a leading "/" is resolved from the workspace root,
        # anything else from the directory of the referencing document.
        if [[ ${reference:0:1} == '/' ]]; then
          baseDir=$PWD
        else
          baseDir=$( dirname "$file" )
        fi
        # Ignore a trailing "#fragment"; only the target file must exist.
        # The path is tested directly: realpath exits non-zero for a missing
        # parent directory, which would abort the step under "bash -e".
        searchFile="${baseDir}/${reference%%#*}"
        if [[ ! -f $searchFile ]]; then
          echo "${line}" >> error-report.log
        fi
      fi
    done < InternalLinks.txt
- name: report Error links
  run: |
    # Print a summary and fail the job when any broken link was recorded.
    # error-report.log only exists if an earlier step found a broken link;
    # create it empty so the counts below read 0 instead of raising
    # "No such file or directory".
    touch error-report.log
    Number_of_404_links=$( wc -l < error-report.log )
    echo -e "\u001b[32mNo. of files scanned : $( cat *FilesList.txt | wc -l )"
    if [[ $Number_of_404_links -ne 0 ]]; then
      # Field 3 onwards is the link itself ("file:line:link").
      echo -e "\u001b[31mNo. of unique Broken links : $( cut -d: -f3- error-report.log | sort -u | wc -l )"
      echo -e "\u001b[31mTotal No. of Broken links : $( cut -d: -f3- error-report.log | wc -l )"
      # Exit statuses are limited to 0-255; use a plain non-zero failure.
      exit 1
    else
      echo -e "\u001b[32mNo Broken-links found"
    fi
- name: Modify Log file
  if: ${{ failure() || cancelled() }}
  run: |
    # Rewrite error-report.log from raw "file:line:link" entries into a
    # report grouped by file:
    #   1. <file>
    #        Line <line> : <link>

    # Nothing to format if no broken link was recorded.
    if [[ ! -s error-report.log ]]; then
      exit 0
    fi

    # Remove the parent directory of the workspace from every path, leaving
    # paths that start with the workspace directory name. dirname is used
    # because the workspace directory name can also occur earlier in $PWD,
    # and a first-occurrence substitution would strip the wrong component.
    baseDir="$( dirname "$PWD" )/"
    sed -i "s|${baseDir}||g" error-report.log

    # Keep the raw entries aside and rebuild error-report.log from them.
    mkdir -p Temp
    mv error-report.log Temp/BrokenLinks.txt

    # Stable sort on the file name keeps each file's entries in their
    # original order. awk compares file names exactly (no regex or
    # substring matching) and inserts " : " after the line number.
    sort -s -t ':' -k1,1 Temp/BrokenLinks.txt | awk -F ':' '
      $1 != current {
        current = $1
        fileCount += 1
        printf "%d. %s\n", fileCount, current
      }
      {
        entry = substr($0, length($1) + 2)
        sub(/:/, " : ", entry)
        printf "\t Line %s\n", entry
      }
    ' > error-report.log
- name: Upload logs
  uses: actions/upload-artifact@v4
  if: ${{ failure() || cancelled() }}
  with:
    name: Hyperlinks-testing-log
    # Relative paths are resolved against the workspace, so the upload also
    # works when the repository is forked or renamed and the runner's
    # checkout directory differs.
    path: |
      error-report.log
      *FilesList.txt
      checkedLinksCache.txt
      *Links.txt
      links_list.txt
      changed_files.txt
      rejected_links.txt
    if-no-files-found: ignore