# Well, this wasn't supposed to be so long and complicated.
# Anyway, it makes sure the wiki works on both GitLab and GitHub by moving
# stuff around and fixing links. Then it reports all broken links
# and unused files that can't be fixed automatically. By default it only
# prints the changes it would make to stdout; to apply them, use `--fix`.
# See Editing.md for more information.
12 # Some stuff that could have been done better:
13 # - Not parsing Markdown with regex. Currently, we for example report
14 # broken links even though they're inside code blocks (e.g. Irclog.md)
15 # - Using the type system (and mypy) to distinguish different link types
16 # to make sure the right functions are called with the right link types
17 # (e.g. page links, file links, links with headers, urls, ...)
18 # - Checking outbound links for 404s.
import functools
import glob
import os
import sys
from os.path import normpath, join, dirname, basename
from typing import Dict, Generator, List, Tuple

import regex  # sudo pip3 install regex
29 # yeah, well, this is ugly but sure beats putting the regex on one line
# yeah, well, this is ugly but sure beats putting the regex on one line
def compile_regex(rgx: str):
    """Compile a pattern that is written across multiple indented lines.

    Every whitespace character is stripped from the pattern before compiling,
    so it can be laid out readably in the source.  Uses the third-party
    `regex` module because (unlike `re`) it supports non-constant length
    look-behinds.
    """
    # NOTE(review): iterating a str yields single characters, so stripping
    # each one removes all whitespace (spaces, newlines, indentation).
    # The trailing `regex.compile(...)` lines were elided in the extraction
    # and are reconstructed here — verify against the original.
    stripped = "".join([line.strip() for line in rgx])
    return regex.compile(stripped)
38 # [Page link](Some_Page)
39 # [Url link](http://example.com)
40 # ![Image](image_1.png)
41 # [![Image link to image](image_inner.png)](image_outer.png)
42 # [![Image link to page](image_inner.png)](Archive/Some_Page)
# regex.sub doesn't support overlapping - we have to use lookbehinds.
45 # Practically, the inner link will never be a page so we don't need to
46 # sub it, but later we can reuse the regex to go through all the links
47 # and check that they're valid.
48 LINK_REGEX = compile_regex("""
def strip_header_link(link: str) -> str:
    """Remove a '#header' anchor from the end of a link, if present.

    Only the part after the LAST '#' is treated as the anchor
    (rfind), so 'a#b#c' becomes 'a#b'.
    """
    header_index = link.rfind('#')
    if header_index != -1:
        link = link[:header_index]
    # NOTE(review): the final return was elided in the extraction;
    # returning the (possibly truncated) link is the only sensible ending.
    return link
def convert_page_name(path: str) -> str:
    """Map a page path to its flat, hyphen-separated, root-level name.

    path can be with or without .md
    """
    if path.startswith("_"):
        # ignore header, footer etc
        return path
    # don't wanna break stuff like mapping-entity-func_door
    # NOTE(review): this guard was elided in the extraction and is
    # reconstructed from the comment above — verify against the original.
    if "-" in basename(path):
        return basename(path)
    headerless = strip_header_link(path)
    # don't reformat these links because they're often linked to from outside
    for exc in ["Repository_Access", "Halogenes_Newbie_Corner"]:
        if headerless == exc or headerless == exc + ".md":
            # NOTE(review): loop body elided in the extraction; returning the
            # unconverted basename keeps these names stable — verify.
            return basename(path)
    return basename(path).replace("_", "-")
def convert_page_link(link: str) -> str:
    """Convert a page link to its new name, warning about '_' in anchors.

    Underscores in the header anchor cannot be fixed automatically
    (the anchor format depends on the renderer), so they are only reported.
    """
    header_index = link.rfind('#')
    if header_index != -1:
        header = link[header_index + 1:]
        # NOTE(review): this condition was elided in the extraction and is
        # reconstructed from the warning text below.
        if "_" in header:
            print("warning: underscore in header: {}".format(link))
    return convert_page_name(link)
def find_paths() -> Tuple[List[str], List[str]]:
    """Return (all file paths, markdown file paths), both sorted.

    Scans the current working directory recursively.
    """
    # NOTE(review): the filter predicate was elided in the extraction;
    # os.path.isfile (excluding directories) is the natural fit — verify.
    all_paths = sorted(filter(os.path.isfile,
                              glob.iglob('**', recursive=True)))
    # all_paths is already sorted, so this stays sorted too
    md_paths = [p for p in all_paths if p.endswith(".md")]
    return all_paths, md_paths
def fix_dir_structure():
    """Rename all markdown pages to their flat, root-level names.

    Only prints the planned renames unless the module-level `apply_fixes`
    flag is set; collisions are reported and skipped.
    """
    _, md_paths = find_paths()
    for path in md_paths:
        fixed = convert_page_name(path)
        # NOTE(review): the branch structure below was partially elided in
        # the extraction and is reconstructed — verify against the original.
        if fixed == path:
            continue
        if os.path.exists(fixed):
            # renaming would overwrite an existing file - never do that
            print("warning: collision: {}".format(path))
        elif apply_fixes:
            os.rename(path, fixed)
        else:
            print("would rename {} to {}".format(path, fixed))
def is_between_files(link: str) -> bool:
    """True if the link points at another file in the wiki.

    External URLs (anything containing '://') and same-page header anchors
    (starting with '#') are not between-files links.
    """
    if "://" in link or link.startswith("#"):
        # http(s) link or link to header on the same page
        return False
    return True
def is_page_link(link: str) -> bool:
    """Best-effort guess whether a between-files link targets a wiki page.

    this is a best guess, i don't think there is a foolproof way to tell
    """
    if link.startswith(("assets", "img")):
        # hopefully nobody adds more directories
        return False
    if "." in basename(link):
        # hopefully it's an extension
        return False
    # files in root without extension will fail
    return True
def replace_link(changes: List[str], match) -> str:
    """regex.sub callback: convert the page link inside one matched link.

    Records a human-readable before/after line in `changes` whenever the
    match text actually changed; returns the (possibly rewritten) match text.
    """
    # NOTE(review): several lines of this body were elided in the extraction
    # (the group(0) read and both returns) and are reconstructed — verify.
    text = match.group(0)
    # offsets of capture group 1 (the link target) relative to the match
    link_start = match.start(1) - match.start()
    link_end = match.end(1) - match.start()
    link = text[link_start:link_end]
    if is_between_files(link) and is_page_link(link):
        new_link = convert_page_link(link)
        new_text = text[:link_start] + new_link + text[link_end:]
        if new_text != text:
            changes.append("\t{} -> {}".format(text, new_text))
        return new_text
    return text
def convert_links():
    """Rewrite page links inside every markdown file.

    Applies the rewrites in place when the module-level `apply_fixes` flag
    is set; otherwise only prints what would change.

    NOTE(review): the `def` line and parts of the body (read / seek /
    truncate) were elided in the extraction and are reconstructed — verify.
    """
    _, md_paths = find_paths()
    for path in md_paths:
        with open(path, 'r+') as f:
            contents = f.read()
            changes: List[str] = []
            replacer = functools.partial(replace_link, changes)
            contents_new = LINK_REGEX.sub(replacer, contents)
            if apply_fixes and contents != contents_new:
                # rewrite the whole file with the fixed contents
                f.seek(0)
                f.write(contents_new)
                f.truncate()
            elif not apply_fixes and any(changes):
                print("would convert these links in {}:".format(path))
                for change in changes:
                    print(change)
def link_to_path(current_file: str, link: str) -> str:
    """Resolve a link found in `current_file` to a wiki-root-relative path.

    Different renderers resolve relative links differently:
    # gitlab root current current root
    # gollum current current current root
    # github ok ok broken broken
    # when not using subdirs, nothing or "." works for all 3
    (table partially garbled in the extraction — column headers lost)
    """
    if link.startswith("..") or link.startswith("/"):
        # these can't resolve consistently on all renderers
        print("file: {} bad link: {}".format(current_file, link))
    # path relative to wiki root, not current file
    current_dir = dirname(current_file)
    link = normpath(join(current_dir, link))
    link = strip_header_link(link)
    # page links don't have an extension - add it
    # NOTE(review): the body of this branch and the final return were elided
    # in the extraction and are reconstructed — verify.
    extension_index = link.rfind('.')
    if extension_index == -1:
        link += ".md"
    return link
def get_file_links(path: str) -> Generator[str, None, None]:
    """Yield every between-files link found in the given markdown file."""
    with open(path, 'r') as f:
        contents = f.read()
    for match in LINK_REGEX.finditer(contents):
        link = match.group(1)
        if is_between_files(link):
            # NOTE(review): the yield was elided in the extraction;
            # it is the only possible body for this generator.
            yield link
def canonicalize(path: str) -> str:
    """Normalize a path for comparisons between links and real files.

    spaces and capitalization don't seem to matter for pages, so fold them;
    non-page files are returned unchanged (their names are exact).
    """
    if path.endswith(".md"):
        return path.replace(" ", "-").casefold()
    # NOTE(review): the non-.md branch was elided in the extraction;
    # returning the path untouched is the natural complement — verify.
    return path
def find_broken(all_paths: List[str], md_paths: List[str]):
    """Print every link in the markdown files that resolves to no real file."""
    # a set makes the membership test below O(1) instead of O(n)
    canonical_paths = {canonicalize(path) for path in all_paths}
    for path in md_paths:
        if path == "Irclog.md":
            continue  # TODO need to parse MD properly to avoid false positives
        for link in get_file_links(path):
            link_target = canonicalize(link_to_path(path, link))
            if link_target not in canonical_paths:
                print("broken link in {}: {}".format(path, link))
def walk_links(canonical_to_real: Dict[str, str], is_linked: Dict[str, bool], current_path: str):
    """Depth-first walk of the link graph, marking reachable files.

    `canonical_to_real` maps canonicalized paths to real paths;
    `is_linked` is mutated in place as files are reached.
    """
    canonical = canonicalize(current_path)
    if canonical not in canonical_to_real:
        # broken link - nothing to do here, we check broken links elsewhere
        # because here we're not guaranteed to walk through all files
        return
    current_path = canonical_to_real[canonical]
    # NOTE(review): the two early returns were elided in the extraction and
    # are reconstructed; the second one prevents infinite recursion on cycles.
    if is_linked[current_path]:
        return
    is_linked[current_path] = True
    # only markdown pages contain further links worth following
    if current_path.endswith(".md"):
        for link in get_file_links(current_path):
            link_target = link_to_path(current_path, link)
            walk_links(canonical_to_real, is_linked, link_target)
def find_unlinked(all_paths: List[str]):
    """Print every file that is not reachable from Home.md via links."""
    canonical_to_real = {canonicalize(path): path for path in all_paths}
    is_linked = {path: False for path in all_paths}
    # ignore these 2 - currently they don't show on GitLab but do on GitHub
    is_linked["_Footer.md"] = True
    is_linked["_Sidebar.md"] = True
    walk_links(canonical_to_real, is_linked, "Home.md")
    for path, linked in sorted(is_linked.items()):
        # NOTE(review): this condition was elided in the extraction;
        # it is the only reading consistent with the message below.
        if not linked:
            print("not reachable from Home: {}".format(path))
def find_problems():
    """Report broken links and files unreachable from Home.

    NOTE(review): this function's `def` line was elided in the extraction;
    the name is reconstructed — verify against the original.
    """
    all_paths, md_paths = find_paths()
    find_broken(all_paths, md_paths)
    find_unlinked(all_paths)


def main():
    """Entry point: fix structure and links, then report remaining problems."""
    # NOTE(review): a module-level `apply_fixes = False` default is assumed
    # to exist in an elided part of the file — verify.
    global apply_fixes
    if len(sys.argv) > 1 and sys.argv[1] == "--fix":
        apply_fixes = True

    # convert file paths - put everything into root
    fix_dir_structure()
    # convert links on all pages
    convert_links()
    # look for broken links and unlinked files
    find_problems()


if __name__ == '__main__':
    main()