5 # Well, this wasn't supposed to be so long and complicated.
6 # Anyway, it makes sure the wiki works on both Gitlab and Github by moving
7 # stuff around and fixing links. Then it reports all remaining broken links
8 # and unused files. Since the wiki is in git, you can use `git status`
9 # and `git diff` to see the changes. You can also use the `--dry-run` flag
10 # to print all changes the script would make without actually making them.
12 # See Editing.md for more information.
14 # Some stuff that could have been done better:
# - Not parsing Markdown with regex. Currently, for example, we report
#   broken links even when they're inside code blocks (e.g. Irclog.md)
17 # - Using the type system (and mypy) to distinguish different link types
18 # to make sure the right functions are called with the right link types
19 # (e.g. page links, file links, links with headers, urls, ...)
20 # - Checking outbound links for 404s.
25 import regex # sudo pip3 install regex
28 from os.path import normpath, join, dirname, basename
31 # yeah, well, this is ugly but sure beats putting the regex on one line
def compile_regex(rgx: str):
    # Compile a pattern written readably across multiple lines by stripping
    # the whitespace used for layout before compiling.
    # NOTE(review): iterating a str yields characters, not lines, so each
    # "line" here is a single character -- confirm rgx handling upstream.
    # regex (unlike re) supports non-constant length look-behinds
        [line.strip() for line in rgx]))
40 # [Page link](Some_Page)
41 # [Url link](http://example.com)
42 # ![Image](image_1.png)
43 # [![Image link to image](image_inner.png)](image_outer.png)
44 # [![Image link to page](image_inner.png)](Archive/Some_Page)
# regex.sub doesn't support overlapping - we have to use lookbehinds.
47 # Practically, the inner link will never be a page so we don't need to
48 # sub it, but later we can reuse the regex to go through all the links
49 # and check that they're valid.
50 LINK_REGEX = compile_regex("""
def strip_header_link(link: str) -> str:
    "remove links to headers inside the file"
    # e.g. "Page.md#section" -> "Page.md"; links without '#' are unchanged.
    header_index = link.rfind('#')
    if header_index != -1:
        link = link[:header_index]
    # bug fix: the function is annotated -> str but fell through
    # returning None when it ended without this return
    return link
def convert_page_name(path: str) -> str:
    "path can be with or without .md"
    # Map an old page path to the flat, dash-separated naming scheme.
    if path.startswith("_"):
        # ignore header, footer etc
        # NOTE(review): this branch's body (presumably returning the path
        # unchanged) is not visible in this view -- confirm.
    # don't wanna break stuff like mapping-entity-func_door
    headerless = strip_header_link(path)
    # don't reformat these links because they're often linked to from outside
    for exc in ["Repository_Access", "Halogenes_Newbie_Corner"]:
        if headerless == exc or headerless == exc + ".md":
            # NOTE(review): this branch's body (presumably returning the
            # path unchanged) is not visible in this view -- confirm.
    # Flatten into the wiki root and replace underscores with dashes.
    return basename(path).replace("_", "-")
def convert_page_link(link: str) -> str:
    # Convert an old-style page link to the new naming scheme; warns about
    # underscores in the '#header' anchor (anchors are not rewritten here).
    header_index = link.rfind('#')
    if header_index != -1:
        header = link[header_index + 1:]
        # NOTE(review): a check like 'if "_" in header:' appears to be
        # missing before this warning in this view -- as written the
        # warning fires for every link with an anchor; confirm.
        print("warning: underscore in header: {}".format(link))
    return convert_page_name(link)
def find_paths() -> Tuple[List[str], List[str]]:
    # Return (all_paths, md_paths): every path under the current directory
    # (recursive glob) and the subset of markdown files, both sorted.
    all_paths = sorted(filter(
        # NOTE(review): the filter predicate (first argument) is not
        # visible in this view -- presumably something like os.path.isfile.
        [name for name in glob.iglob('**', recursive=True)]))
    md_paths = sorted(filter(lambda s: s.endswith(".md"), all_paths))
    return all_paths, md_paths
def fix_dir_structure():
    # Rename every markdown file to its converted page name (flat layout,
    # dashes instead of underscores), warning on name collisions.
    _, md_paths = find_paths()
    for path in md_paths:
        fixed = convert_page_name(path)
        # NOTE(review): the branching between the collision warning, the
        # dry-run message and the actual rename is not fully visible in
        # this view -- presumably dry-run only prints and os.rename runs
        # otherwise; confirm.
        if os.path.exists(fixed):
            print("warning: collision: {}".format(path))
        print("would rename {} to {}".format(path, fixed))
        os.rename(path, fixed)
def is_between_files(link: str) -> bool:
    """Return True for links between files inside the wiki repo; False for
    external URLs and same-page anchor links."""
    if "://" in link or link.startswith("#"):
        # http(s) link or link to header on the same page
        return False
    # bug fix: without explicit returns the function always returned None
    # (falsy), so no link was ever treated as between-files
    return True
def is_page_link(link: str) -> bool:
    """Best-effort guess whether a between-files link targets a wiki page
    (pages have no extension) rather than an asset/file."""
    # this is a best guess, i don't think there is a foolproof way to tell
    if link.startswith("assets") or link.startswith("img"):
        # hopefully nobody adds more directories
        return False
    if "." in basename(link):
        # hopefully it's an extension
        return False
    # files in root without extension will fail
    # bug fix: explicit returns restored -- without them the function
    # always returned None (falsy) and no link was ever converted
    return True
def replace_link(changes: List[str], match) -> str:
    # Callback for LINK_REGEX.sub: rewrite the link target (capture group 1)
    # inside the whole matched text and record the change in 'changes'.
    # Offsets of group 1 relative to the start of the whole match.
    link_start = match.start(1) - match.start()
    link_end = match.end(1) - match.start()
    # NOTE(review): 'text' is presumably match.group() -- its assignment is
    # not visible in this view; confirm.
    link = text[link_start:link_end]
    if is_between_files(link) and is_page_link(link):
        new_link = convert_page_link(link)
        new_text = text[:link_start] + new_link + text[link_end:]
        changes.append("\t{} -> {}".format(text, new_text))
175 _, md_paths = find_paths()
176 for path in md_paths:
177 with open(path, 'r+') as f:
181 replacer = functools.partial(replace_link, changes)
182 contents_new = LINK_REGEX.sub(replacer, contents)
183 if dry_run and any(changes):
184 print("would convert these links in {}:".format(path))
185 for change in changes:
188 if not dry_run and contents != contents_new:
190 f.write(contents_new)
def link_to_path(current_file: str, link: str) -> str:
    # Resolve a link found in 'current_file' to a path relative to the wiki
    # root: normalize relative components, strip '#header' anchors, and add
    # the implied .md extension to extensionless page links.
    #
    # How the different wikis resolve relative links:
    #          gitlab  root    current current root
    #          gollum  current current current root
    #          github  ok      ok      broken  broken
    #
    # when not using subdirs, nothing or "." works for all 3
    if link.startswith("..") or link.startswith("/"):
        # NOTE(review): this format string is never applied (missing
        # .format()) and has two placeholders but only one argument.
        print("file: {} bad link: {}", link)
    # path relative to wiki root, not current file
    current_dir = dirname(current_file)
    link = normpath(join(current_dir, link))
    link = strip_header_link(link)
    # page links don't have an extension - add it
    extension_index = link.rfind('.')
    if extension_index == -1:
def get_file_links(path: str) -> Generator[str, None, None]:
    # Yield every between-files link target (capture group 1 of LINK_REGEX)
    # found in the file at 'path'; external URLs and same-page anchors are
    # filtered out by is_between_files.
    with open(path, 'r') as f:
        # NOTE(review): 'contents' is presumably f.read() -- the assignment
        # is not visible in this view; confirm.
    for match in LINK_REGEX.finditer(contents):
        link = match.group(1)
        if is_between_files(link):
            # NOTE(review): the 'yield link' line is not visible here.
def canonicalize(path: str) -> str:
    """Normalize a path for comparisons: markdown pages compare
    space/dash- and case-insensitively; other files compare as-is."""
    # spaces and capitalization don't seem to matter for pages
    if path.endswith(".md"):
        return path.replace(" ", "-").casefold()
    # bug fix: non-markdown paths fell through returning None, which would
    # collapse all non-page keys when used in dict/set comprehensions
    return path
def find_broken(all_paths: List[str], md_paths: List[str]):
    """Report links in markdown files that don't resolve to any existing
    path in the wiki."""
    # A set, not a list: membership is tested once per link below, and a
    # list would make that O(n) per test.
    canonical_paths = {canonicalize(path) for path in all_paths}
    for path in md_paths:
        if path == "Irclog.md":
            continue  # TODO need to parse MD properly to avoid false positives
        for link in get_file_links(path):
            link_target = canonicalize(link_to_path(path, link))
            if link_target not in canonical_paths:
                print("broken link in {}: {}".format(path, link))
def walk_links(canonical_to_real: Dict[str, str], is_linked: Dict[str, bool], current_path: str):
    """Depth-first walk of the link graph starting at current_path, marking
    every reachable file True in is_linked."""
    canonical = canonicalize(current_path)
    if canonical not in canonical_to_real:
        # broken link - nothing to do here, we check broken links elsewhere
        # because here we're not guaranteed to walk through all files
        return
    current_path = canonical_to_real[canonical]
    if is_linked[current_path]:
        # already visited - avoid infinite recursion on link cycles
        return
    is_linked[current_path] = True
    # only markdown pages contain further links worth following
    if current_path.endswith(".md"):
        for link in get_file_links(current_path):
            link_target = link_to_path(current_path, link)
            walk_links(canonical_to_real, is_linked, link_target)
def find_unlinked(all_paths: List[str]):
    """Print every file that cannot be reached from Home.md by following
    links."""
    canonical_to_real = {canonicalize(path): path for path in all_paths}
    is_linked = {path: False for path in all_paths}

    # ignore these 2 - currently they don't show on GitLab but do on GitHub
    is_linked["_Footer.md"] = True
    is_linked["_Sidebar.md"] = True

    walk_links(canonical_to_real, is_linked, "Home.md")

    for path, linked in is_linked.items():
        # bug fix: restored the guard -- without it every file was printed
        # as unreachable regardless of the walk's result
        if not linked:
            print("not reachable from Home: {}".format(path))
286 all_paths, md_paths = find_paths()
287 find_broken(all_paths, md_paths)
288 find_unlinked(all_paths)
293 if len(sys.argv) > 1 and sys.argv[1] == "--dry-run":
296 # convert file paths - put everything into root
299 # convert links on all pages
302 # look for broken links and unlinked files
306 if __name__ == '__main__':