3 # Well, this wasn't supposed to be so long and complicated.
4 # Anyway, it makes sure the wiki works on both Gitlab and Github by moving
5 # stuff around and fixing links. Then it reports all remaining broken links
6 # and unused files. Since the wiki is in git, you can use `git status`
7 # and `git diff` to see the changes. You can also use the `--dry-run` flag
8 # to print all changes the script would make without actually making them.
10 # See Editing.md for more information.
12 # Some stuff that could have been done better:
13 # - Not parsing Markdown with regex. Currently we report, for example,
14 #   broken links even when they appear inside code blocks (e.g. Irclog.md)
15 # - Using the type system (and mypy) to distinguish different link types
16 # to make sure the right functions are called with the right link types
17 # (e.g. page links, file links, links with headers, urls, ...)
18 # - Checking outbound links for 404s.
23 import regex # sudo pip3 install regex
26 from os.path import normpath, join, dirname, basename
29 # yeah, well, this is ugly but sure beats putting the regex on one line
30 def compile_regex(rgx: str):
# Purpose: build one compiled pattern from a readable multi-line regex
# source string, stripping per-line indentation so the pattern can be
# written across several indented lines.
31 # regex (unlike re) supports non-constant length look-behinds
# NOTE(review): original lines 32-33 are elided from this listing; they
# presumably join the stripped lines and pass the result to
# regex.compile() -- confirm against the full file before editing.
34 [line.strip() for line in rgx]))
# The Markdown link shapes this pattern has to recognize:
38 # [Page link](Some_Page)
39 # [Url link](http://example.com)
40 # ![Image](image_1.png)
41 # [![Image link to image](image_inner.png)](image_outer.png)
42 # [![Image link to page](image_inner.png)](Archive/Some_Page)
44 # regex.sub doesn't support overlapping - we have to use lookbehinds.
45 # Practically, the inner link will never be a page so we don't need to
46 # sub it, but later we can reuse the regex to go through all the links
47 # and check that they're valid.
# NOTE(review): the pattern text itself (original lines 49-71) is elided
# from this listing, so the capture-group layout (group 1 = link target,
# judging by the match.group(1) uses below) cannot be verified here.
48 LINK_REGEX = compile_regex("""
72 def strip_header_link(link: str) -> str:
73 "remove links to headers inside the file"
# Cuts everything from the last '#' onward, e.g. "Page.md#section" -> "Page.md".
75 header_index = link.rfind('#')
76 if header_index != -1:
77 link = link[:header_index]
# NOTE(review): the trailing `return link` (original line ~78) is elided
# from this listing.
81 def convert_page_name(path: str) -> str:
82 "path can be with or without .md"
# Purpose: map a wiki page path to its flattened root-level file name,
# replacing underscores with dashes in the basename.
84 if path.startswith("_"):
85 # ignore header, footer etc
# NOTE(review): the bodies of this and the following early-exit branches
# (original lines 86-88, 90-91, 96-97) are elided from this listing --
# each presumably returns `path` unchanged; confirm against the full file.
89 # don't wanna break stuff like mapping-entity-func_door
92 headerless = strip_header_link(path)
93 # don't reformat these links because they're often linked to from outside
94 for exc in ["Repository_Access", "Halogenes_Newbie_Corner"]:
95 if headerless == exc or headerless == exc + ".md":
98 return basename(path).replace("_", "-")
101 def convert_page_link(link: str) -> str:
# Like convert_page_name, but also warns about links whose '#header' part
# looks suspicious (headers themselves are not rewritten).
102 header_index = link.rfind('#')
103 if header_index != -1:
104 header = link[header_index + 1:]
# NOTE(review): original line 105 is elided -- presumably an
# `if "_" in header:` guard for the warning below; confirm.
106 print("warning: underscore in header: {}".format(link))
107 return convert_page_name(link)
110 def find_paths() -> Tuple[List[str], List[str]]:
# Returns (all tracked paths, just the .md pages), both sorted, by
# globbing the working directory recursively.
111 all_paths = sorted(filter(
# NOTE(review): the filter predicate (original line 112) is elided from
# this listing -- it presumably drops directories; confirm.
113 [name for name in glob.iglob('**', recursive=True)]))
114 md_paths = sorted(filter(lambda s: s.endswith(".md"), all_paths))
115 return all_paths, md_paths
118 def fix_dir_structure():
# Moves every page to its converted (dash-separated, root-level) name,
# warning instead of overwriting when two pages would collide.
119 _, md_paths = find_paths()
120 for path in md_paths:
121 fixed = convert_page_name(path)
# NOTE(review): original lines 122-124, 127 and 129 are elided -- likely a
# "nothing to change" check, a `continue` after the collision warning, and
# the dry-run branch around the rename; confirm against the full file.
125 if os.path.exists(fixed):
126 print("warning: collision: {}".format(path))
128 print("would rename {} to {}".format(path, fixed))
130 os.rename(path, fixed)
133 def is_between_files(link: str) -> bool:
# True when the link points from one wiki file to another, i.e. it is not
# an external URL and not a '#header' link within the same page.
134 if "://" in link or link.startswith("#"):
135 # http(s) link or link to header on the same page
# NOTE(review): the return statements (original lines 136-138) are elided
# -- presumably `return False` here and `return True` at the end.
141 def is_page_link(link: str) -> bool:
# Heuristic: treat the target as a wiki page (rather than an asset file)
# unless it lives in a known asset directory or has a file extension.
142 # this is a best guess, i don't think there is a foolproof way to tell
144 if link.startswith("assets") or link.startswith("img"):
145 # hopefully nobody adds more directories
147 if "." in basename(link):
148 # hopefully it's an extension
150 # files in root without extension will fail
# NOTE(review): the return statements (original lines 143, 146, 149 and
# 151+) are elided from this listing.
155 def replace_link(changes: List[str], match) -> str:
# regex.sub callback: rewrites the captured link target (group 1) to its
# converted page name and records a human-readable before/after line in
# `changes` for reporting.
# NOTE(review): original lines 156, 159, 161, 165 and 167+ are elided --
# `text` is presumably `match.group()` and the function presumably returns
# `new_text` (or `text` unchanged when no rewrite applies); confirm.
157 link_start = match.start(1) - match.start()
158 link_end = match.end(1) - match.start()
160 link = text[link_start:link_end]
162 if is_between_files(link) and is_page_link(link):
163 new_link = convert_page_link(link)
164 new_text = text[:link_start] + new_link + text[link_end:]
166 changes.append("\t{} -> {}".format(text, new_text))
# NOTE(review): the enclosing `def` and the initialization of `contents`,
# `changes` and `dry_run` (original lines ~168-178) are elided from this
# listing. This fragment rewrites page links in every .md file in place.
173 _, md_paths = find_paths()
174 for path in md_paths:
175 with open(path, 'r+') as f:
179 replacer = functools.partial(replace_link, changes)
180 contents_new = LINK_REGEX.sub(replacer, contents)
181 if dry_run and any(changes):
182 print("would convert these links in {}:".format(path))
183 for change in changes:
186 if not dry_run and contents != contents_new:
188 f.write(contents_new)
# NOTE(review): seek/truncate housekeeping around the r+ write (original
# lines ~187, 189) appears to be elided -- without it, writing a shorter
# string would leave stale trailing bytes; confirm against the full file.
192 def link_to_path(current_file: str, link: str) -> str:
# Resolve a link found in `current_file` into a path relative to the wiki
# root, stripping any '#header' part and (for extension-less page links)
# appending '.md'.
194 # gitlab root current current root
195 # gollum current current current root
196 # github ok ok broken broken
198 # when not using subdirs, nothing or "." works for all 3
200 if link.startswith("..") or link.startswith("/"):
# BUG(review): the message below is never formatted -- print() receives the
# literal template plus `link` as two separate arguments, and
# `current_file` is not passed at all. It should read:
#     print("file: {} bad link: {}".format(current_file, link))
201 print("file: {} bad link: {}", link)
203 # path relative to wiki root, not current file
204 current_dir = dirname(current_file)
205 link = normpath(join(current_dir, link))
207 link = strip_header_link(link)
209 # page links don't have an extension - add it
210 extension_index = link.rfind('.')
211 if extension_index == -1:
# NOTE(review): the '.md'-appending body and the final `return link`
# (original lines 212-216) are elided from this listing.
217 def get_file_links(path: str) -> Generator[str, None, None]:
# Yield every between-file link target found in the Markdown file at `path`.
218 with open(path, 'r') as f:
# NOTE(review): original line 219 (presumably `contents = f.read()`) and
# the `yield link` body (line 224+) are elided from this listing.
220 for match in LINK_REGEX.finditer(contents):
221 link = match.group(1)
223 if is_between_files(link):
227 def canonicalize(path: str) -> str:
# Normalize a page path for comparisons: dashes for spaces, casefolded.
228 # spaces and capitalization don't seem to matter for pages
229 if path.endswith(".md"):
230 return path.replace(" ", "-").casefold()
# NOTE(review): the non-.md branch (original lines 231+, presumably
# `return path` unchanged) is elided from this listing.
235 def find_broken(all_paths: List[str], md_paths: List[str]):
# Report every link whose canonicalized target does not match any existing
# file's canonicalized path.
236 canonical_paths = [canonicalize(path) for path in all_paths]
238 for path in md_paths:
239 if path == "Irclog.md":
240 continue # TODO need to parse MD properly to avoid false positives
241 for link in get_file_links(path):
242 link_target = canonicalize(link_to_path(path, link))
243 if not link_target in canonical_paths:
244 #print("broken link in {}: {} -> {}".format(path, link, link_target))
245 print("broken link in {}: {}".format(path, link))
248 def walk_links(canonical_to_real: Dict[str, str], is_linked: Dict[str, bool], current_path: str):
# Depth-first reachability walk: marks `current_path` and everything it
# links to (transitively, through .md pages) as linked in `is_linked`.
249 canonical = canonicalize(current_path)
250 if canonical not in canonical_to_real:
251 # broken link - nothing to do here, we check broken links elsewhere
252 # because here we're not guaranteed to walk through all files
253 #print("not in known paths: {}".format(current_path))
# NOTE(review): the `return` for this branch and for the already-visited
# check below (original lines 254-255 and 259-260) are elided from this
# listing.
256 current_path = canonical_to_real[canonical]
258 if is_linked[current_path]:
261 is_linked[current_path] = True
262 if current_path.endswith(".md"):
263 for link in get_file_links(current_path):
264 link_target = link_to_path(current_path, link)
265 walk_links(canonical_to_real, is_linked, link_target)
268 def find_unlinked(all_paths: List[str]):
# Print every file that cannot be reached from Home.md by following links.
269 canonical_to_real = {canonicalize(path): path for path in all_paths}
270 is_linked = {path: False for path in all_paths}
272 # ignore these 2 - currently they don't show on GitLab but do on GitHub
273 is_linked["_Footer.md"] = True
274 is_linked["_Sidebar.md"] = True
276 walk_links(canonical_to_real, is_linked, "Home.md")
278 for path, linked in is_linked.items():
# NOTE(review): original line 279 (presumably `if not linked:`) is elided
# from this listing.
280 print("not reachable from Home: {}".format(path))
# NOTE(review): the enclosing `def` (original line ~283, likely the
# combined check/report entry point) is elided from this listing; this
# fragment runs both link checks over the whole tree.
284 all_paths, md_paths = find_paths()
285 find_broken(all_paths, md_paths)
286 find_unlinked(all_paths)
# NOTE(review): the enclosing main() definition and most of its body
# (original lines ~288-303) are elided from this listing; only the
# --dry-run flag parse and the step-marker comments survive here.
291 if len(sys.argv) > 1 and sys.argv[1] == "--dry-run":
294 # convert file paths - put everything into root
297 # convert links on all pages
300 # look for broken links and unlinked files
304 if __name__ == '__main__':
# NOTE(review): the guarded call (original line 305, presumably main())
# is elided from this listing.