assets/check-and-fix.py

   1 #!/usr/bin/env python3
   2
   3 # Run in wiki root
   4
   5 # Well, this wasn't supposed to be so long and complicated.
   6 # Anyway, it makes sure the wiki works on both Gitlab and Github by moving
   7 # stuff around and fixing links. Then it reports all remaining broken links
   8 # and unused files. Since the wiki is in git, you can use `git status`
   9 # and `git diff` to see the changes. You can also use the `--dry-run` flag
  10 # to print all changes the script would make without actually making them.
  11
  12 # See Editing.md for more information.
  13
  14 # Some stuff that could have been done better:
  15 #  - Not parsing Markdown with regex. Currently, we for example report
  16 #    broken links even though they're inside code blocks (e.g. Irclog.md)
  17 #  - Using the type system (and mypy) to distinguish different link types
  18 #    to make sure the right functions are called with the right link types
  19 #    (e.g. page links, file links, links with headers, urls, ...)
  20 #  - Checking outbound links for 404s.
  21
  22 import sys
  23 import os
  24 import glob
  25 import regex  # sudo pip3 install regex
  26 import functools
  27 from typing import *
  28 from os.path import normpath, join, dirname, basename
  29
  30
  31 # yeah, well, this is ugly but sure beats putting the regex on one line
  32 def compile_regex(rgx: str):
  33     # regex (unlike re) supports non-constant length look-behinds
  34     return regex.compile(
  35         "".join(
  36             [line.strip() for line in rgx]))
  37
  38
  39 # examples:
  40 # [Page link](Some_Page)
  41 # [Url link](http://example.com)
  42 # ![Image](image_1.png)
  43 # [![Image link to image](image_inner.png)](image_outer.png)
  44 # [![Image link to page](image_inner.png)](Archive/Some_Page)
  45
  46 # regex.sub doesnt support overlapping - we have to use lookbehinds.
  47 # Practically, the inner link will never be a page so we don't need to
  48 # sub it, but later we can reuse the regex to go through all the links
  49 # and check that they're valid.
  50 LINK_REGEX = compile_regex("""
  51 (?<=
  52     \[
  53         (?:
  54             [^\[\]]*
  55         |
  56             \!\[
  57                 [^\[\]]*
  58             \]
  59             \(
  60                 [^()]*
  61             \)
  62         )
  63     \]
  64 )
  65 \(
  66     ([^()]*)
  67 \)
  68 """)
  69
  70
  71 dry_run = False
  72
  73
  74 def strip_header_link(link: str) -> str:
  75     "remove links to headers inside the file"
  76
  77     header_index = link.rfind('#')
  78     if header_index != -1:
  79         link = link[:header_index]
  80     return link
  81
  82
  83 def convert_page_name(path: str) -> str:
  84     "path can be with or without .md"
  85
  86     if path.startswith("_"):
  87         # ignore header, footer etc
  88         return path
  89
  90     if "-" in path:
  91         # don't wanna break stuff like mapping-entity-func_door
  92         return path
  93
  94     headerless = strip_header_link(path)
  95     # don't reformat these links because they're often linked to from outside
  96     for exc in ["Repository_Access", "Halogenes_Newbie_Corner"]:
  97         if headerless == exc or headerless == exc + ".md":
  98             return path
  99
 100     return basename(path).replace("_", "-")
 101
 102
 103 def convert_page_link(link: str) -> str:
 104     header_index = link.rfind('#')
 105     if header_index != -1:
 106         header = link[header_index + 1:]
 107         if "_" in header:
 108             print("warning: underscore in header: {}".format(link))
 109     return convert_page_name(link)
 110
 111
 112 def find_paths() -> Tuple[List[str], List[str]]:
 113     all_paths = sorted(filter(
 114         os.path.isfile,
 115         [name for name in glob.iglob('**', recursive=True)]))
 116     md_paths = sorted(filter(lambda s: s.endswith(".md"), all_paths))
 117     return all_paths, md_paths
 118
 119
 120 def fix_dir_structure():
 121     _, md_paths = find_paths()
 122     for path in md_paths:
 123         fixed = convert_page_name(path)
 124         if fixed == path:
 125             continue
 126
 127         if os.path.exists(fixed):
 128             print("warning: collision: {}".format(path))
 129         elif dry_run:
 130             print("would rename {} to {}".format(path, fixed))
 131         else:
 132             os.rename(path, fixed)
 133
 134
 135 def is_between_files(link: str) -> bool:
 136     if "://" in link or link.startswith("#"):
 137         # http(s) link or link to header on the same page
 138         return False
 139     else:
 140         return True
 141
 142
 143 def is_page_link(link: str) -> bool:
 144     # this is a best guess, i don't think there is a foolproof way to tell
 145
 146     if link.startswith("assets") or link.startswith("img"):
 147         # hopefully nobody adds more directories
 148         return False
 149     if "." in basename(link):
 150         # hopefully it's an extension
 151         return False
 152     # files in root without extension will fail
 153
 154     return True
 155
 156
 157 def replace_link(changes: List[str], match) -> str:
 158     text = match.group()
 159     link_start = match.start(1) - match.start()
 160     link_end = match.end(1) - match.start()
 161
 162     link = text[link_start:link_end]
 163
 164     if is_between_files(link) and is_page_link(link):
 165         new_link = convert_page_link(link)
 166         new_text = text[:link_start] + new_link + text[link_end:]
 167         if text != new_text:
 168             changes.append("\t{} -> {}".format(text, new_text))
 169         return new_text
 170     else:
 171         return text
 172
 173
 174 def fix_links():
 175     _, md_paths = find_paths()
 176     for path in md_paths:
 177         with open(path, 'r+') as f:
 178             contents = f.read()
 179
 180             changes = []
 181             replacer = functools.partial(replace_link, changes)
 182             contents_new = LINK_REGEX.sub(replacer, contents)
 183             if dry_run and any(changes):
 184                 print("would convert these links in {}:".format(path))
 185                 for change in changes:
 186                     print(change)
 187
 188             if not dry_run and contents != contents_new:
 189                 f.seek(0)
 190                 f.write(contents_new)
 191                 f.truncate()
 192
 193
 194 def link_to_path(current_file: str, link: str) -> str:
 195     #           nothing     .           ..          /
 196     # gitlab    root        current     current     root
 197     # gollum    current     current     current     root
 198     # github    ok          ok          broken      broken
 199
 200     # when not using subdirs, nothing or "." works for all 3
 201
 202     if link.startswith("..") or link.startswith("/"):
 203         print("file: {} bad link: {}", link)
 204
 205     # path relative to wiki root, not curent file
 206     current_dir = dirname(current_file)
 207     link = normpath(join(current_dir, link))
 208
 209     link = strip_header_link(link)
 210
 211     # page links don't have an extension - add it
 212     extension_index = link.rfind('.')
 213     if extension_index == -1:
 214         link = link + '.md'
 215
 216     return link
 217
 218
 219 def get_file_links(path: str) -> Generator[str, None, None]:
 220     with open(path, 'r') as f:
 221         contents = f.read()
 222         for match in LINK_REGEX.finditer(contents):
 223             link = match.group(1)
 224
 225             if is_between_files(link):
 226                 yield link
 227
 228
 229 def canonicalize(path: str) -> str:
 230     # spaces and capitalization don't seem to matter for pages
 231     if path.endswith(".md"):
 232         return path.replace(" ", "-").casefold()
 233     else:
 234         return path
 235
 236
 237 def find_broken(all_paths: List[str], md_paths: List[str]):
 238     canonical_paths = [canonicalize(path) for path in all_paths]
 239
 240     for path in md_paths:
 241         if path == "Irclog.md":
 242             continue  # TODO need to parse MD properly to avoid false posiives
 243         for link in get_file_links(path):
 244             link_target = canonicalize(link_to_path(path, link))
 245             if not link_target in canonical_paths:
 246                 #print("broken link in {}: {} -> {}".format(path, link, link_target))
 247                 print("broken link in {}: {}".format(path, link))
 248
 249
 250 def walk_links(canonical_to_real: Dict[str, str], is_linked: Dict[str, bool], current_path: str):
 251     canonical = canonicalize(current_path)
 252     if canonical not in canonical_to_real:
 253         # broken link - nothing to do here, we check broken links elsewhere
 254         # because here we're not guaranteed to walk through all files
 255         #print("not in known paths: {}".format(current_path))
 256         return
 257
 258     current_path = canonical_to_real[canonical]
 259
 260     if is_linked[current_path]:
 261         return
 262
 263     is_linked[current_path] = True
 264     if current_path.endswith(".md"):
 265         for link in get_file_links(current_path):
 266             link_target = link_to_path(current_path, link)
 267             walk_links(canonical_to_real, is_linked, link_target)
 268
 269
 270 def find_unlinked(all_paths: List[str]):
 271     canonical_to_real = {canonicalize(path): path for path in all_paths}
 272     is_linked = {path: False for path in all_paths}
 273
 274     # ignore these 2 - currently they don't show on GitLab but do on GitHub
 275     is_linked["_Footer.md"] = True
 276     is_linked["_Sidebar.md"] = True
 277
 278     walk_links(canonical_to_real, is_linked, "Home.md")
 279
 280     for path, linked in is_linked.items():
 281         if not linked:
 282             print("not reachable from Home: {}".format(path))
 283
 284
 285 def check_links():
 286     all_paths, md_paths = find_paths()
 287     find_broken(all_paths, md_paths)
 288     find_unlinked(all_paths)
 289
 290
 291 def main():
 292     global dry_run
 293     if len(sys.argv) > 1 and sys.argv[1] == "--dry-run":
 294         dry_run = True
 295
 296     # convert file paths - put everything into root
 297     fix_dir_structure()
 298
 299     # convert links on all pages
 300     fix_links()
 301
 302     # look for broken links and unlinked files
 303     check_links()
 304
 305
 306 if __name__ == '__main__':
 307     main()