#!/usr/bin/python3

from html import *
from span import *
from table import *
from util import *

# Names exported by `from <this module> import *`.
# NOTE(review): no name `article` is defined in the visible code (the entry
# point defined below is `markup_article`). Confirm that one of the
# star-imports above provides `article`; otherwise a star-import of this
# module raises AttributeError.
__all__ = 'article'.split()

def markup_article(s: str) -> str:
    """Run the full markup pipeline over the article text `s`.

    The text is first rendered by div(), the result is passed through
    pretend(), and finally hush() is applied to that output.
    """
    return hush(pretend(div(s)))

# Break the article into three kinds of divs.
#
# 1. tagged     First line starts with 'X  ', where X is a graph character
#               followed by at least two spaces. Remainder is indented.
#               Blank lines are allowed, although any trailing ones are
#               removed. First unindented line signals the div ended.
#               May or may not support digraphs, depending on the tag.
#
# 2. heading    Not indented. May have a digraph set as last line.
#               Rest is 2 lines: title and hyphens.
#               Hyphen spacing determines heading level.
#
# 3. paragraph  Not indented. May have a digraph set as last line.
#               Default div type.
def div(s: str) -> str:
    """Split the article text into divs and return their concatenated markup.

    Every line has its trailing whitespace removed. Blank lines between
    divs are skipped. Each div is rendered by tagged_div(),
    maybe_heading_div() or paragraph_div(); the results are joined
    without a separator. Input with no non-blank lines yields ''.
    """
    rows = [raw.rstrip() for raw in s.splitlines()]
    total = len(rows)
    pieces = []                         # rendered divs, in document order
    pos = 0                             # index of the next unprocessed line

    while pos < total:
        first = rows[pos]

        # Blank lines outside of a div carry no meaning.
        if not first:
            pos += 1
            continue

        if first[1:3] == '  ' and not first[0].isspace():
            # TAGGED div: a visible character followed by two spaces.
            # The tag is replaced by a space so the first line is indented
            # like the lines that follow it.
            tag = first[0]
            chunk = [' ' + first[1:]]
            pos += 1

            # Blank and indented lines belong to the div; the first line
            # starting with a visible character (or end of input) ends it.
            while pos < total and not rows[pos][:1].strip():
                chunk.append(rows[pos])
                pos += 1

            # Trailing blank lines are not part of the div.
            while chunk and not chunk[-1]:
                chunk.pop()

            # Strip the indentation shared by all non-blank lines.
            margin = min(len(row) - len(row.lstrip()) for row in chunk if row)
            chunk = [row[margin:] for row in chunk]

            pieces.append(tagged_div(tag, chunk))
            continue

        # HEADING or PARAGRAPH: gather lines up to a blank line or the end
        # of input.
        chunk = []
        while pos < total and rows[pos]:
            chunk.append(rows[pos])
            pos += 1

        # Headings and paragraphs honor the digraph scheme for disabling
        # the special meaning of certain characters.
        chunk, current_specials = check_digraphs(chunk)

        # Whatever is not a heading is by definition a paragraph.
        rendered = (maybe_heading_div(chunk, current_specials)
                    or paragraph_div(chunk, current_specials))
        pieces.append(rendered)

    # The per-div functions have already produced the final markup;
    # just catenate it.
    return ''.join(pieces)

# A two-line block where the second line contains at most hyphens and spaces
# is a heading.
def maybe_heading_div(div: list[str], current_specials: dict[str, str]) -> 'str | None':
    """Render `div` as an HTML heading if it has the shape of one.

    A heading is exactly two lines: a title, and a second line that
    (after leading whitespace) starts with '-' and contains nothing but
    hyphens and spaces.

    Parameters:
        div: the lines of the block.
        current_specials: special-character mapping passed on to span().

    Returns:
        '<hN>title</hN>\\n' for a heading, where N is 1 plus the number of
        spaces directly after the first dash, capped at 6. Returns None
        when the block is not a heading.
    """
    if len(div) != 2:
        return None                     # This is not a heading.

    dashes = div[1].lstrip()
    if not dashes.startswith('-'):
        return None                     # Nor is this a heading.
    if dashes.strip('- '):
        return None                     # Something other than '-' and ' '.

    # The heading level is 1 + the number of spaces after the first dash.
    after_first = dashes[1:]
    spaces = len(after_first) - len(after_first.lstrip(' '))

    # HTML defines only <h1> through <h6>; cap the level so that wide
    # spacing cannot produce a non-existent element such as <h7>.
    level = min(1 + spaces, 6)

    # Handle any spans in the title, then format the heading.
    title: str = span(current_specials, div[0])

    return f'<h{level}>{title}</h{level}>\n'

# A paragraph block is anything that wasn't identified as a different kind.
def paragraph_div(div: list[str], current_specials: dict[str, str]) -> str:
    """Render the lines of `div` as a single '<p>...</p>' element.

    The lines are joined with single spaces, passed through span() with
    `current_specials`, and wrapped in paragraph tags followed by a
    newline.
    """
    text = ' '.join(div)
    body = span(current_specials, text)
    return ''.join(('<p>', body, '</p>\n'))

# Tagged div types are:
#
#     *  bulleted list
#     #  numbered list
#     a  assembler code (like p, but requests syntax highlighting)
#     A  single line of assembler code (tighter vertical margins, indented)
#     p  preformatted text
#     P  single line of preformatted text (tighter vertical margins, indented)
#     t  table content
#     T  table class attachments (precedes 't' div)
#
def tagged_div(tag: str, block: list[str]) -> str:
    """Render the tagged div `block` according to its `tag` character.

    Parameters:
        tag: the tag character taken from the first line of the div.
        block: the lines of the div.

    Returns:
        The markup for the div. A 'T' div returns '' after handing its
        content to save_table_class_script(). An unknown tag returns a
        fixed 'unrecognized div' paragraph.
    """
    # Lists and tables may end with a digraph line; split it off.
    if tag in '*#tT':
        block, current_specials = check_digraphs(block)

        # Only list items (* bulleted, # numbered) get span markup here.
        if tag in '*#':
            for index, item in enumerate(block):
                block[index] = span(current_specials, item)

    # Preformatted and assembler text bypasses span(), so it must be
    # XML-quoted explicitly.
    if tag in 'pPaA':
        block = [xmlquote(row) for row in block]

    # Tags whose output is simply: opening + joined lines + closing.
    wrappers = {
        '*': ('<ul>\n<li>', '</li>\n<li>', '</li>\n</ul>\n'),
        '#': ('<ol>\n<li>', '</li>\n<li>', '</li>\n</ol>\n'),
        'a': ('<pre class="asm">', '\n', '</pre>\n'),
        'A': ('<pre class="asm single">', '\n', '</pre>\n'),
        'p': ('<pre>', '\n', '</pre>\n'),
        'P': ('<pre class="single">', '\n', '</pre>\n'),
    }

    wrapper = wrappers.get(tag)
    if wrapper is not None:
        opening, joiner, closing = wrapper
        return ''.join((opening, joiner.join(block), closing))

    # Tables are handled by dedicated functions.
    if tag == 't':
        return markup_table(block, current_specials)

    if tag == 'T':
        save_table_class_script(block, current_specials)
        return ''

    # Anything else is not implemented.
    return '<p><b>unrecognized div</b></p>'

# Digraphs sidestep the problem of quoting characters. Consider this markup:
#
#     To get italicized text, employ asterisks *like this*.
#
# The above doesn't succeed at including the asterisks in the HTML, 
# due to their special meaning for markup. We fix this by putting a digraph
# list as the last line of a block that temporarily reassigns the special
# markup meaning to a different character. The following does what we want:
#
#     To get italicized text, employ asterisks $*like this*$.
#     $*
#
# Order within digraphs doesn't matter, so *$ would have worked as well.
# We can tell which is which, because only one of the two symbols has
# special meaning. Separate digraphs with spaces if more than one
# symbol needs to be overridden.
#
# Note that the substituted symbol doesn't have to be used. If we
# didn't want the text 'like this' to actually appear in italics:
#
#     To get italicized text, employ asterisks *like this*.
#     $*
#
# This function modifies 'block' so that a digraph line doesn't appear
# in the HTML.
#
# Warning: The XML-reserved symbols < & > don't work as digraph replacements
# due to a chicken-egg dilemma. Making them work is sufficiently complex
# to not be a priority.
def check_digraphs(block: list[str]) -> tuple[list[str], dict[str, str]]:
    """Split an optional trailing digraph line off `block`.

    The last line is a digraph line when it holds at least one word, every
    word is exactly two characters long, and each word pairs one character
    that is a key of `specials` with one that is not.

    Parameters:
        block: the lines of a div. The list itself is not modified.

    Returns:
        (lines, mapping). For a valid digraph line, `lines` is a copy of
        `block` without its last line and `mapping` is a copy of `specials`
        in which each listed special character maps to its substitute.
        Otherwise `block` and `specials` are returned unchanged.
    """
    # An empty block has no last line to inspect.
    if not block:
        return block, specials

    words = block[-1].split()

    # A blank last line, or any word whose length isn't 2, means this is
    # not a digraph line. Use the ordinary special characters. The blank
    # case needs its own test because any() over no words is False.
    if not words or any(len(word) != 2 for word in words):
        return block, specials

    # If we get here, it's probable the last line is a digraph list.
    # Try to make a list of current specials.
    cs: dict[str, str] = specials.copy()
    for a, b in words:

        if a in specials and b not in specials:
            cs[a] = b                   # character b will have meaning a
        elif b in specials and a not in specials:
            cs[b] = a                   # character a will have meaning b
        else:
            return block, specials      # invalid digraph block

    # Return the block without the digraph line with the current specials.
    return block[:-1], cs

# Remove the unmatchable character BEL after all macros are processed.
def hush(s: str) -> str:
    """Return `s` with every BEL character ('\\x07') removed."""
    return ''.join(s.split('\x07'))

# pretend this is a standalone HTML file
def pretend(s: str) -> str:
    """Return `article_html` with each '/ARTICLE\\' marker replaced by `s`."""
    marker = '/ARTICLE\\'
    page = article_html.replace(marker, s)
    return page

# map symbols with special meaning onto themselves (for now)
# Each special character is both key and value until a digraph line
# substitutes a different character for it.
specials: dict[str, str] = {ch: ch for ch in '-~"*`[]\'\u00b7'}

