We all like a puzzle. I'll include my solution here...
#######################################
# Print a summary of the href="..." links found in one HTML file.
# Links are grouped into document links (.htm/.html), image links
# (.jpg/.jpeg) and everything else, with a count per distinct link.
# Arguments: $1 - path of the HTML file to scan
# Outputs:   writes the report to stdout
# Returns:   exit status of awk
#######################################
report_links() {
  awk '
    {
      # Splitting on angle brackets puts the inside of each tag in its
      # own piece, so at most one tag is examined at a time.
      n = split($0, piece, /[<>]/)
      for (i = 1; i <= n; i++) {
        # Match case-insensitively by searching a lowercased copy.
        if (!match(tolower(piece[i]), /href *= *"[^"]*"/))
          continue

        # RSTART/RLENGTH come from the lowercased copy but are applied
        # to the original text so the report keeps the original case.
        link = substr(piece[i], RSTART, RLENGTH)

        # Classify on the extracted link itself, not on the whole tag,
        # so attributes that follow the href do not hide the extension.
        lower = tolower(link)
        if (lower ~ /\.html?"$/) {
          htm_array[link]++
          htm_cnt++
        } else if (lower ~ /\.jpe?g"$/) {
          jpg_array[link]++
          jpg_cnt++
        } else {
          oth_array[link]++
          oth_cnt++
        }
        cnt++
      }
    }
    END {
      printf "\n%d links in %s\n", cnt, FILENAME
      if (htm_cnt) {
        printf "\t%d document links\n", htm_cnt
        for (indx in htm_array)
          printf "\t\t%5d of %s\n", htm_array[indx], indx
      }
      if (jpg_cnt) {
        printf "\t%d image links\n", jpg_cnt
        for (indx in jpg_array)
          printf "\t\t%5d of %s\n", jpg_array[indx], indx
      }
      if (oth_cnt) {
        printf "\t%d other links\n", oth_cnt
        for (indx in oth_array)
          printf "\t\t%5d of %s\n", oth_array[indx], indx
      }
    }
  ' "$1"
}

# Report on every HTML file in the current directory.
for file in *.html; do
  # An unmatched glob is left as the literal pattern "*.html"; skip it
  # instead of handing awk a file that does not exist.
  [ -e "$file" ] || continue
  report_links "$file"
done
Tested. Sample output:
0 links in df.html
4 links in eg2.html
1 document links
1 of HREF = "
3 other links
1 of HREF = "
2 of HREF = "
4 links in eg3.html
2 document links
1 of HREF = "
1 of HREF = "
2 image links
1 of HREF = "
1 of HREF = "
0 links in stats.html
3 links in tab0.html
2 document links
1 of HREF="tab1.html"
1 of HREF="tab2.html"
1 other links
1 of HREF="favicon.ico"