#!/usr/bin/env python3

# Run in wiki root.
#
# Well, this wasn't supposed to be so long and complicated. Anyway, it
# makes sure the wiki works on both GitLab and GitHub by moving stuff
# around and fixing links. Then it reports all remaining broken links and
# unused files. Since the wiki is in git, you can use `git status` and
# `git diff` to see the changes. You can also use the `--dry-run` flag to
# print all changes the script would make without actually making them.
#
# See Editing.md for more information.
#
# Some stuff that could have been done better:
# - Not parsing Markdown with regex. Currently we, for example, report
#   broken links even though they're inside code blocks (e.g. Irclog.md).
# - Using the type system (and mypy) to distinguish different link types
#   to make sure the right functions are called with the right link types
#   (e.g. page links, file links, links with headers, URLs, ...).
# - Checking outbound links for 404s.

import sys
import os
import glob
import functools
from typing import Dict, Generator, List, Tuple
from os.path import normpath, join, dirname, basename

import regex  # third-party: sudo pip3 install regex


# Yeah, well, this is ugly, but it sure beats putting the regex on one line.
def compile_regex(rgx: str):
    # regex (unlike re) supports non-constant-length look-behinds.
    # Strip all whitespace so the pattern can be written across
    # multiple indented lines.
    return regex.compile("".join(rgx.split()))
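# A quick illustration (the pattern below is hypothetical, not one the
# script uses): because all whitespace is stripped before compiling,
# these two calls compile the same pattern:
#
#   compile_regex("""
#       foo
#       (bar)*
#   """)
#   regex.compile("foo(bar)*")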
# Examples of the links we have to handle:
# [Page link](Some_Page)
# [Url link](http://example.com)
# ![Image](image_1.png)
# [![Image link to image](image_inner.png)](image_outer.png)
# [![Image link to page](image_inner.png)](Archive/Some_Page)
#
# regex.sub doesn't support overlapping matches - we have to use
# look-behinds. Practically, the inner link will never be a page so we
# don't need to sub it, but later we can reuse the regex to go through
# all the links and check that they're valid.
LINK_REGEX = compile_regex(r"""
    (?<=
        \[
        (?: [^\[\]]* | \!\[ [^\[\]]* \] \( [^()]* \) )
        \]
    )
    \( ([^()]*) \)
""")

dry_run = False


def strip_header_link(link: str) -> str:
    """remove links to headers inside the file"""
    header_index = link.rfind('#')
    if header_index != -1:
        link = link[:header_index]
    return link


def convert_page_name(path: str) -> str:
    """path can be with or without .md"""
    if path.startswith("_"):
        # ignore header, footer etc.
        return path
    if "-" in path:
        # don't wanna break stuff like mapping-entity-func_door
        return path
    headerless = strip_header_link(path)
    # don't reformat these links because they're often linked to from outside
    for exc in ["Repository_Access", "Halogenes_Newbie_Corner"]:
        if headerless == exc or headerless == exc + ".md":
            return path
    return basename(path).replace("_", "-")


def convert_page_link(link: str) -> str:
    header_index = link.rfind('#')
    if header_index != -1:
        header = link[header_index + 1:]
        if "_" in header:
            print("warning: underscore in header: {}".format(link))
    return convert_page_name(link)


def find_paths() -> Tuple[List[str], List[str]]:
    all_paths = sorted(filter(
        os.path.isfile, glob.iglob('**', recursive=True)))
    md_paths = sorted(filter(lambda s: s.endswith(".md"), all_paths))
    return all_paths, md_paths


def fix_dir_structure():
    _, md_paths = find_paths()
    for path in md_paths:
        fixed = convert_page_name(path)
        if fixed == path:
            continue
        if os.path.exists(fixed):
            print("warning: collision: {}".format(path))
        elif dry_run:
            print("would rename {} to {}".format(path, fixed))
        else:
            os.rename(path, fixed)


def is_between_files(link: str) -> bool:
    if "://" in link or link.startswith("#"):
        # http(s) link or link to a header on the same page
        return False
    else:
        return True


def is_page_link(link: str) -> bool:
    # this is a best guess, I don't think there is a foolproof way to tell
    if link.startswith("assets") or link.startswith("img"):
        # hopefully nobody adds more directories
        return False
    if "." in basename(link):
        # hopefully it's an extension
        return False
    # files in root without an extension will fail (they'd be treated as pages)
    return True


def replace_link(changes: List[str], match) -> str:
    text = match.group()
    link_start = match.start(1) - match.start()
    link_end = match.end(1) - match.start()
    link = text[link_start:link_end]
    if is_between_files(link) and is_page_link(link):
        new_link = convert_page_link(link)
        new_text = text[:link_start] + new_link + text[link_end:]
        if text != new_text:
            changes.append("\t{} -> {}".format(text, new_text))
        return new_text
    else:
        return text


def fix_links():
    _, md_paths = find_paths()
    for path in md_paths:
        with open(path, 'r+') as f:
            contents = f.read()
            changes: List[str] = []
            replacer = functools.partial(replace_link, changes)
            contents_new = LINK_REGEX.sub(replacer, contents)
            if dry_run and changes:
                print("would convert these links in {}:".format(path))
                for change in changes:
                    print(change)
            if not dry_run and contents != contents_new:
                f.seek(0)
                f.write(contents_new)
                f.truncate()


def link_to_path(current_file: str, link: str) -> str:
    # How each host resolves a relative link, by prefix:
    #
    #             (nothing)  .        ..       /
    #   gitlab    root       current  current  root
    #   gollum    current    current  current  root
    #   github    ok         ok       broken   broken
    #
    # When not using subdirs, nothing or "." works for all 3.
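    # Worked example (hypothetical inputs): for current_file
    # "Archive/Old.md" and link "Some_Page#setup", the steps below yield
    # "Archive/Some_Page.md" - joined with the current dir, header
    # stripped, ".md" appended.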
    if link.startswith("..") or link.startswith("/"):
        print("file: {} bad link: {}".format(current_file, link))

    # path relative to the wiki root, not the current file
    current_dir = dirname(current_file)
    link = normpath(join(current_dir, link))

    link = strip_header_link(link)

    # page links don't have an extension - add it
    extension_index = link.rfind('.')
    if extension_index == -1:
        link = link + '.md'
    return link


def get_file_links(path: str) -> Generator[str, None, None]:
    with open(path, 'r') as f:
        contents = f.read()
    for match in LINK_REGEX.finditer(contents):
        link = match.group(1)
        if is_between_files(link):
            yield link


def canonicalize(path: str) -> str:
    # spaces and capitalization don't seem to matter for pages
    if path.endswith(".md"):
        return path.replace(" ", "-").casefold()
    else:
        return path


def find_broken(all_paths: List[str], md_paths: List[str]):
    canonical_paths = [canonicalize(path) for path in all_paths]
    for path in md_paths:
        if path == "Irclog.md":
            # TODO need to parse MD properly to avoid false positives
            continue
        for link in get_file_links(path):
            link_target = canonicalize(link_to_path(path, link))
            if link_target not in canonical_paths:
                #print("broken link in {}: {} -> {}".format(path, link, link_target))
                print("broken link in {}: {}".format(path, link))


def walk_links(canonical_to_real: Dict[str, str],
               is_linked: Dict[str, bool],
               current_path: str):
    canonical = canonicalize(current_path)
    if canonical not in canonical_to_real:
        # broken link - nothing to do here, we check broken links elsewhere
        # because here we're not guaranteed to walk through all files
        #print("not in known paths: {}".format(current_path))
        return
    current_path = canonical_to_real[canonical]

    if is_linked[current_path]:
        return
    is_linked[current_path] = True

    if current_path.endswith(".md"):
        for link in get_file_links(current_path):
            link_target = link_to_path(current_path, link)
            walk_links(canonical_to_real, is_linked, link_target)


def find_unlinked(all_paths: List[str]):
    canonical_to_real = {canonicalize(path): path for path in all_paths}
    is_linked = {path: False for path in all_paths}

    # ignore these 2 - currently they don't show on GitLab but do on GitHub
    is_linked["_Footer.md"] = True
    is_linked["_Sidebar.md"] = True

    walk_links(canonical_to_real, is_linked, "Home.md")
    for path, linked in is_linked.items():
        if not linked:
            print("not reachable from Home: {}".format(path))


def check_links():
    all_paths, md_paths = find_paths()
    find_broken(all_paths, md_paths)
    find_unlinked(all_paths)


def main():
    global dry_run
    if len(sys.argv) > 1 and sys.argv[1] == "--dry-run":
        dry_run = True

    # convert file paths - put everything into root
    fix_dir_structure()
    # convert links on all pages
    fix_links()
    # look for broken links and unlinked files
    check_links()


if __name__ == '__main__':
    main()
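# Typical usage (run from the wiki checkout root; the filename
# "fix_wiki.py" is hypothetical - use whatever this script is saved as):
#
#   ./fix_wiki.py --dry-run   # preview renames and link rewrites
#   ./fix_wiki.py             # apply them, then review with `git diff`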