3 # Well, this wasn't supposed to be so long and complicated.
4 # Anyway, it makes sure the wiki works on both Gitlab and Github by moving
5 # stuff around and fixing links. Then it reports all remaining broken links
6 # and unused files. Since the wiki is in git, you can use `git status`
7 # and `git diff` to see the changes. You can also use the `--dry-run` flag
8 # to print all changes the script would make without actually making them.
10 # See Editing.md for more information.
12 # Some stuff that could have been done better:
13 # - Not parsing Markdown with regex. Currently we report, for example,
14 #   broken links even when they appear inside code blocks (e.g. Irclog.md)
15 # - Using the type system (and mypy) to distinguish different link types
16 # to make sure the right functions are called with the right link types
17 # (e.g. page links, file links, links with headers, urls, ...)
18 # - Checking outbound links for 404s.
23 import regex # sudo pip3 install regex
26 from os.path import normpath, join, dirname, basename
29 # yeah, well, this is ugly but sure beats putting the regex on one line
30 def compile_regex(rgx: str):
# Purpose: build one compiled pattern from a readable multi-line regex
# source string, stripping per-line indentation so the pattern can be
# written across several indented lines.
31 # regex (unlike re) supports non-constant length look-behinds
# NOTE(review): original lines 32-33 are elided from this listing; they
# presumably join the stripped lines and pass the result to
# regex.compile() -- confirm against the full file before editing.
34 [line.strip() for line in rgx]))
# The Markdown link shapes this pattern has to recognize:
38 # [Page link](Some_Page)
39 # [Url link](http://example.com)
40 # ![Image](image_1.png)
41 # [![Image link to image](image_inner.png)](image_outer.png)
42 # [![Image link to page](image_inner.png)](Archive/Some_Page)
44 # regex.sub doesn't support overlapping - we have to use lookbehinds.
45 # Practically, the inner link will never be a page so we don't need to
46 # sub it, but later we can reuse the regex to go through all the links
47 # and check that they're valid.
# NOTE(review): the pattern text itself (original lines 49-71) is elided
# from this listing, so the capture-group layout (group 1 = link target,
# judging by the match.group(1) uses below) cannot be verified here.
48 LINK_REGEX = compile_regex("""
72 def strip_header_link(link: str) -> str:
73 "remove links to headers inside the file"
# Cuts everything from the last '#' onward, e.g. "Page.md#section" -> "Page.md".
75 header_index = link.rfind('#')
76 if header_index != -1:
77 link = link[:header_index]
# NOTE(review): the trailing `return link` (original line ~78) is elided
# from this listing.
81 def convert_page_name(path: str) -> str:
82 "path can be with or without .md"
# Purpose: map a wiki page path to its flattened root-level file name,
# replacing underscores with dashes in the basename.
84 if path.startswith("_"):
85 # ignore header, footer etc
# NOTE(review): the bodies of this and the following early-exit branches
# (original lines 86-88, 90-91, 96-97) are elided from this listing --
# each presumably returns `path` unchanged; confirm against the full file.
89 # don't wanna break stuff like mapping-entity-func_door
92 headerless = strip_header_link(path)
93 # don't reformat these links because they're often linked to from outside
94 for exc in ["Repository_Access", "Halogenes_Newbie_Corner"]:
95 if headerless == exc or headerless == exc + ".md":
98 return basename(path).replace("_", "-")
101 def convert_page_link(link: str) -> str:
# Like convert_page_name, but also warns about links whose '#header' part
# looks suspicious (headers themselves are not rewritten).
102 header_index = link.rfind('#')
103 if header_index != -1:
104 header = link[header_index + 1:]
# NOTE(review): original line 105 is elided -- presumably an
# `if "_" in header:` guard for the warning below; confirm.
106 print("warning: underscore in header: {}".format(link))
107 return convert_page_name(link)
110 def find_paths() -> Tuple[List[str], List[str]]:
# Returns (all tracked paths, just the .md pages), both sorted, by
# globbing the working directory recursively.
111 all_paths = sorted(filter(
# NOTE(review): the filter predicate (original line 112) is elided from
# this listing -- it presumably drops directories; confirm.
113 [name for name in glob.iglob('**', recursive=True)]))
114 md_paths = sorted(filter(lambda s: s.endswith(".md"), all_paths))
115 return all_paths, md_paths
118 def fix_dir_structure():
# Moves every page to its converted (dash-separated, root-level) name,
# warning instead of overwriting when two pages would collide.
119 _, md_paths = find_paths()
120 for path in md_paths:
121 fixed = convert_page_name(path)
# NOTE(review): original lines 122-124, 127 and 129 are elided -- likely a
# "nothing to change" check, a `continue` after the collision warning, and
# the dry-run branch around the rename; confirm against the full file.
125 if os.path.exists(fixed):
126 print("warning: collision: {}".format(path))
128 print("would rename {} to {}".format(path, fixed))
130 os.rename(path, fixed)
133 def is_between_files(link: str) -> bool:
# True when the link points from one wiki file to another, i.e. it is not
# an external URL and not a '#header' link within the same page.
134 if "://" in link or link.startswith("#"):
135 # http(s) link or link to header on the same page
# NOTE(review): the return statements (original lines 136-138) are elided
# -- presumably `return False` here and `return True` at the end.
141 def is_page_link(link: str) -> bool:
# Heuristic: treat the target as a wiki page (rather than an asset file)
# unless it lives in a known asset directory or has a file extension.
142 # this is a best guess, i don't think there is a foolproof way to tell
144 if link.startswith("assets") or link.startswith("img"):
145 # hopefully nobody adds more directories
147 if "." in basename(link):
148 # hopefully it's an extension
150 # files in root without extension will fail
# NOTE(review): the return statements (original lines 143, 146, 149 and
# 151+) are elided from this listing.
155 def replace_link(changes: List[str], match) -> str:
# regex.sub callback: rewrites the captured link target (group 1) to its
# converted page name and records a human-readable before/after line in
# `changes` for reporting.
# NOTE(review): original lines 156, 159, 161, 165 and 167+ are elided --
# `text` is presumably `match.group()` and the function presumably returns
# `new_text` (or `text` unchanged when no rewrite applies); confirm.
157 link_start = match.start(1) - match.start()
158 link_end = match.end(1) - match.start()
160 link = text[link_start:link_end]
162 if is_between_files(link) and is_page_link(link):
163 new_link = convert_page_link(link)
164 new_text = text[:link_start] + new_link + text[link_end:]
166 changes.append("\t{} -> {}".format(text, new_text))
# NOTE(review): the enclosing `def` and the initialization of `contents`,
# `changes` and `dry_run` (original lines ~168-178) are elided from this
# listing. This fragment rewrites page links in every .md file in place.
173 _, md_paths = find_paths()
174 for path in md_paths:
175 with open(path, 'r+') as f:
179 replacer = functools.partial(replace_link, changes)
180 contents_new = LINK_REGEX.sub(replacer, contents)
181 if dry_run and any(changes):
182 print("would convert these links in {}:".format(path))
183 for change in changes:
186 if not dry_run and contents != contents_new:
188 f.write(contents_new)
# NOTE(review): seek/truncate housekeeping around the r+ write (original
# lines ~187, 189) appears to be elided -- without it, writing a shorter
# string would leave stale trailing bytes; confirm against the full file.
192 def link_to_path(current_file: str, link: str) -> str:
# Resolve a link found in `current_file` into a path relative to the wiki
# root, stripping any '#header' part and (for extension-less page links)
# appending '.md'.
194 # gitlab root current current root
195 # gollum current current current root
196 # github ok ok broken broken
198 # when not using subdirs, nothing or "." works for all 3
200 if link.startswith("..") or link.startswith("/"):
# BUG(review): the message below is never formatted -- print() receives the
# literal template plus `link` as two separate arguments, and
# `current_file` is not passed at all. It should read:
#     print("file: {} bad link: {}".format(current_file, link))
201 print("file: {} bad link: {}", link)
203 # path relative to wiki root, not current file
204 current_dir = dirname(current_file)
205 link = normpath(join(current_dir, link))
207 link = strip_header_link(link)
209 # page links don't have an extension - add it
210 extension_index = link.rfind('.')
211 if extension_index == -1:
# NOTE(review): the '.md'-appending body and the final `return link`
# (original lines 212-216) are elided from this listing.
217 def get_file_links(path: str) -> Generator[str, None, None]:
# Yield every between-file link target found in the Markdown file at `path`.
218 with open(path, 'r') as f:
# NOTE(review): original line 219 (presumably `contents = f.read()`) and
# the `yield link` body (line 224+) are elided from this listing.
220 for match in LINK_REGEX.finditer(contents):
221 link = match.group(1)
223 if is_between_files(link):
227 def canonicalize(path: str) -> str:
# Normalize a page path for comparisons: dashes for spaces, casefolded.
228 # spaces and capitalization don't seem to matter for pages
229 if path.endswith(".md"):
230 return path.replace(" ", "-").casefold()
# NOTE(review): the non-.md branch (original lines 231+, presumably
# `return path` unchanged) is elided from this listing.
235 def find_broken(all_paths: List[str], md_paths: List[str]):
# Report every link whose canonicalized target does not match any existing
# file's canonicalized path.
236 canonical_paths = [canonicalize(path) for path in all_paths]
238 for path in md_paths:
239 if path == "Irclog.md":
240 continue # TODO need to parse MD properly to avoid false positives
241 for link in get_file_links(path):
242 link_target = canonicalize(link_to_path(path, link))
243 if not link_target in canonical_paths:
244 #print("broken link in {}: {} -> {}".format(path, link, link_target))
245 print("broken link in {}: {}".format(path, link))
248 def walk_links(canonical_to_real: Dict[str, str], is_linked: Dict[str, bool], current_path: str):
# Depth-first reachability walk: marks `current_path` and everything it
# links to (transitively, through .md pages) as linked in `is_linked`.
249 canonical = canonicalize(current_path)
250 if canonical not in canonical_to_real:
251 # broken link - nothing to do here, we check broken links elsewhere
252 # because here we're not guaranteed to walk through all files
253 #print("not in known paths: {}".format(current_path))
# NOTE(review): the `return` for this branch and for the already-visited
# check below (original lines 254-255 and 259-260) are elided from this
# listing.
256 current_path = canonical_to_real[canonical]
258 if is_linked[current_path]:
261 is_linked[current_path] = True
262 if current_path.endswith(".md"):
263 for link in get_file_links(current_path):
264 link_target = link_to_path(current_path, link)
265 walk_links(canonical_to_real, is_linked, link_target)
268 def find_unlinked(all_paths: List[str]):
# Print every file that cannot be reached from Home.md by following links.
269 canonical_to_real = {canonicalize(path): path for path in all_paths}
270 is_linked = {path: False for path in all_paths}
272 # ignore these 2 - currently they don't show on GitLab but do on GitHub
273 is_linked["_Footer.md"] = True
274 is_linked["_Sidebar.md"] = True
276 walk_links(canonical_to_real, is_linked, "Home.md")
278 for path, linked in is_linked.items():
# NOTE(review): original line 279 (presumably `if not linked:`) is elided
# from this listing.
280 print("not reachable from Home: {}".format(path))
# NOTE(review): the enclosing `def` (original line ~283, likely the
# combined check/report entry point) is elided from this listing; this
# fragment runs both link checks over the whole tree.
284 all_paths, md_paths = find_paths()
285 find_broken(all_paths, md_paths)
286 find_unlinked(all_paths)
# NOTE(review): the enclosing main() definition and most of its body
# (original lines ~288-303) are elided from this listing; only the
# --dry-run flag parse and the step-marker comments survive here.
291 if len(sys.argv) > 1 and sys.argv[1] == "--dry-run":
294 # convert file paths - put everything into root
297 # convert links on all pages
300 # look for broken links and unlinked files
304 if __name__ == '__main__':
# NOTE(review): the guarded call (original line 305, presumably main())
# is elided from this listing.