website: Add Python script to check relative links.

This commit is contained in:
Qi Xiao 2021-11-29 19:21:51 +00:00
parent 7f820c0ff1
commit b342163e4e
2 changed files with 71 additions and 0 deletions

View File

@ -24,6 +24,9 @@ docset: $(TOOL_BINS) $(HTMLS)
publish: gen
rsync -aLv --delete ./_dst/ $(PUBLISH_DIR)/
check-rellinks: gen
python3 tools/check-rellinks.py $(DST_DIR)
clean:
rm -rf $(TOOL_BINS) $(HTMLS) $(DST_DIR) $(DOCSET_TMP_DIR) $(DOCSET_DST_DIR)

View File

@ -0,0 +1,68 @@
import dataclasses
import glob
import os
import os.path
import sys
import urllib.parse
import bs4
@dataclasses.dataclass
class Link:
href: str
parsed: urllib.parse.ParseResult
def main(args):
if len(args) != 2:
print('Usage: check-rellinks dir')
sys.exit(1)
os.chdir(args[1])
filenames = glob.glob('**/*.html', recursive=True)
targets = {}
rellinks = {}
for filename in filenames:
with open(filename) as f:
soup = bs4.BeautifulSoup(f, 'html.parser')
links = [Link(href=e['href'], parsed=urllib.parse.urlparse(e['href']))
for e in soup.find_all('a', href=True)]
rellinks[filename] = [link for link in links if link.parsed.scheme == '']
targets[filename] = [e['id'] for e in soup.find_all(id=True)]
def check(path, fragment):
if path.endswith('.atom') and fragment == '':
return True
return path in targets and (fragment == '' or fragment in targets[path])
has_broken = False
for filename in rellinks:
if filename.endswith('-release-notes.html'):
continue
dirname = os.path.dirname(filename)
broken_links = []
for link in rellinks[filename]:
path = link.parsed.path
if path == '':
path = filename
else:
if os.path.splitext(path)[1] == '':
path += '/index.html'
if path.startswith('/'):
path = path.lstrip('/')
else:
path = os.path.normpath(os.path.join(dirname, path))
if not check(path, link.parsed.fragment):
broken_links.append(link.href)
if broken_links:
if not has_broken:
print('Found broken links:')
has_broken = True
print(filename)
for link in broken_links:
print(f' {link}')
if has_broken:
sys.exit(1)
if __name__ == '__main__':
main(sys.argv)