New mdsplit version.

This commit is contained in:
Sekun 2024-10-24 20:58:06 +02:00
parent 17c72ba06e
commit a9f92aa16a

199
prog/mdsplit.py Normal file → Executable file
View file

@ -7,7 +7,9 @@
# /// # ///
import argparse import argparse
import re from collections import defaultdict
from dataclasses import dataclass
from enum import Enum
from pathlib import Path from pathlib import Path
from slugify import slugify from slugify import slugify
@ -18,6 +20,15 @@ parser = argparse.ArgumentParser(
parser.add_argument("mdfile", help="The org file", type=Path) parser.add_argument("mdfile", help="The org file", type=Path)
parser.add_argument("mdbook", help="mdbook root diretory", type=Path) parser.add_argument("mdbook", help="mdbook root diretory", type=Path)
parser.add_argument(
"-d",
"--max-depth",
help="Max depth for headings",
type=int,
default=1,
dest="depth",
)
args = parser.parse_args() args = parser.parse_args()
if not args.mdfile.is_file(): if not args.mdfile.is_file():
@ -28,70 +39,152 @@ if not args.mdbook.is_dir():
"`mdbook` must be a root mdbook directory initialiezd with `mdbook init`" "`mdbook` must be a root mdbook directory initialiezd with `mdbook init`"
) )
with open(args.mdfile) as f: if args.depth < 1:
data = f.read() raise ValueError("`depth` must be >= 1")
data = data.split("```")
for i, d in enumerate(data[:]):
if i % 2 == 0:
continue
data[i] = "```" + re.sub(r"^(#+) ", r"\1", data[i], flags=re.MULTILINE) + "```"
data = "".join(data).splitlines(keepends=True) @dataclass
class ExtraTitle:
level: int
title: str
output = Path(parser.mdbook) / "src"
splitn = [idx for (idx, d) in enumerate(data) if d.startswith("# ")] class DType(Enum):
splitn = list(zip(splitn[:], splitn[1:] + [None])) CODE = 1
BODY = 2
TITLE = 3
@dataclass
class Content:
    """One line of the input document together with its classification."""

    # Raw text of the line, exactly as it was read from the file.
    content: str
    # Kind of line: code, body text or title.
    dtype: DType
    # Heading details; only populated for title lines, otherwise None.
    extra: ExtraTitle | None = None
@dataclass
class Config:
    """Mutable parser state shared by the line handlers."""

    # True while the parser is between an opening and a closing code fence.
    inside_code: bool
# Chapters are written into the `src` directory of the mdbook root that was
# passed (and validated) on the command line, rather than into a path that is
# hard-coded relative to the current working directory.
OUTPUT_DIR: Path = args.mdbook / "src"
# Headings deeper than this level do not start a new output file.
MAX_LEVEL: int = args.depth

with open(args.mdfile, "r") as f:
    print(args.mdfile)
    # Keep the line endings: lines are later written back out verbatim.
    mdlines: list[str] = f.readlines()

# Classified lines, in document order.
parsed_lines: list[Content] = []
# Parser state; the document starts outside of any code fence.
config: Config = Config(inside_code=False)
def if_parse_begin_code(
    line: str,
    config: Config,
    parsed_lines: list[Content],
):
    """Handle *line* if it opens a fenced code block.

    When the line starts with three backticks, the parser state is switched
    to "inside code" and the line is recorded as a CODE entry.

    Returns True when the line was consumed, False otherwise.
    """
    opens_fence = line.startswith("```")
    if not opens_fence:
        return False

    config.inside_code = True
    fence_entry = Content(content=line, dtype=DType.CODE)
    parsed_lines.append(fence_entry)
    return True
def if_parse_title(line: str, config: Config, parsed_lines: list[Content]):
    """Handle *line* if it is a Markdown ATX heading.

    A heading is a run of `#` characters followed by a space and the title
    text (or by the end of the line for an empty heading). Lines that merely
    start with `#` but are not headings, such as `#tag` or `#!shebang`, are
    left for the next handler.

    Args:
        line: The raw input line, including its line ending.
        config: Parser state; not used by this handler.
        parsed_lines: List that receives the TITLE entry.

    Returns:
        True when the line was recorded as a title, False otherwise.
    """
    marker, _, title = line.partition(" ")
    # An empty heading ("##\n") has no space, so the line ending is still
    # attached to the marker; drop it so it does not inflate the level.
    marker = marker.rstrip("\r\n")
    if not marker or marker.strip("#"):
        # The first token is not made of `#` only: this is not a heading.
        return False

    parsed_lines.append(
        Content(
            content=line,
            dtype=DType.TITLE,
            extra=ExtraTitle(level=len(marker), title=title),
        )
    )
    return True
def parse_code(line, config, parse_lines):
    """Record *line* as a CODE entry unconditionally.

    Args:
        line: The raw input line.
        config: Parser state; not used by this handler.
        parse_lines: List that receives the new entry.

    Returns:
        Always True, so it can close a chain of handlers as the fallback.
    """
    # Append to the list that was passed in, not to a module-level global.
    parse_lines.append(Content(content=line, dtype=DType.CODE))
    return True
def parse_content(line, config, parse_lines):
    """Record *line* as a BODY entry unconditionally.

    Args:
        line: The raw input line.
        config: Parser state; not used by this handler.
        parse_lines: List that receives the new entry.

    Returns:
        Always True, so it can close a chain of handlers as the fallback.
    """
    # Append to the list that was passed in, not to a module-level global.
    parse_lines.append(Content(content=line, dtype=DType.BODY))
    return True
def if_parse_end_code(line, config, parse_lines):
    """Handle *line* if it closes a fenced code block.

    When the line starts with three backticks, it is recorded as a CODE
    entry and the parser state is switched back to "outside code".

    Args:
        line: The raw input line.
        config: Parser state; `inside_code` is cleared on a closing fence.
        parse_lines: List that receives the new entry.

    Returns:
        True when the line was consumed, False otherwise.
    """
    if line.startswith("```"):
        # Append to the list that was passed in, not to a module-level global.
        parse_lines.append(Content(content=line, dtype=DType.CODE))
        config.inside_code = False
        return True
    return False
def do_parse_nocode(line, config, parse_lines):
    """Classify a line found outside of a fenced code block.

    The handlers are tried in order (code fence opener, title, plain body)
    and the first one that reports success stops the chain.

    Returns True if some handler consumed the line, False otherwise.
    """
    handlers = (if_parse_begin_code, if_parse_title, parse_content)
    for handler in handlers:
        if handler(line, config, parse_lines):
            return True
    return False
# Classify every input line, switching handlers depending on whether the
# parser is currently inside a fenced code block.
for line in mdlines:
    if config.inside_code:
        # Inside a fence: the line either closes it or is plain code.
        if not if_parse_end_code(line, config, parsed_lines):
            parse_code(line, config, parsed_lines)
        continue
    do_parse_nocode(line, config, parsed_lines)
filepath_suffix = "_prelude.md"
filepath = OUTPUT_DIR / ("0" + filepath_suffix)
num_titles = defaultdict(int)
summaries = [] summaries = []
for idx, (start, end) in enumerate(splitn[:], start=1):
d = data[start:end]
title = d[0][2:].rstrip()
num = f"{idx:02d}"
basename = f"{num}.{slugify(title)}.md"
summary = {"title": title, "basename": basename, "subs": []}
subcontent = d[1:] for parsed_line in parsed_lines:
if parsed_line.dtype == DType.TITLE and parsed_line.extra.level <= MAX_LEVEL:
level = parsed_line.extra.level
name = d[0] num_titles[level] += 1
splitn = [idx for (idx, d) in enumerate(subcontent) if d.startswith("## ")] num_title = num_titles[level]
title = parsed_line.extra.title.strip()
slug_title = slugify(title)
filepath_suffix = f"_{slug_title}.md"
with open(output / basename, "w") as f: # reset key of num_titles if key > level
print(f"# {title}", file=f) keys = {k for k in num_titles.keys() if k > level}
for key in keys:
del num_titles[key]
if splitn: filepath_prefix = ".".join(
d = "".join(subcontent[: splitn[0]]) f"{i:02d}" for _, i in sorted(num_titles.items(), key=lambda x: x[0])
print(d, file=f)
else:
print("".join(subcontent), file=f)
splitn = list(zip(splitn[:], splitn[1:] + [None]))
for jdx, (start, end) in enumerate(splitn[:], start=1):
d = subcontent[start:end]
title = d[0][2:].rstrip()
basename = f"{idx:02d}.{jdx:02d}.{slugify(title)}.md"
summary["subs"].append(
{
"title": title,
"basename": basename,
}
) )
with open(output / basename, "w") as f: basename = filepath_prefix + filepath_suffix
print("".join(d), file=f) filepath = OUTPUT_DIR / basename
summaries.append(summary) summaries.append((level, title, basename))
with open(output / "SUMMARY.md", "w") as f: with open(filepath, "a") as f:
print("# SUMMARY", file=f) f.write(parsed_line.content)
for item in summaries:
title = item["title"] else:
basename = item["basename"] with open(filepath, "a") as f:
print(f"- [{title}](./{basename})", file=f) f.write(parsed_line.content)
for sub in item["subs"]:
title = sub["title"] with open(OUTPUT_DIR / "SUMMARY.md", "w") as f:
basename = sub["basename"] for level, title, basename in summaries:
print(f" - [{title}](./{basename})", file=f) print(level * " ", f"- [{title}](./{basename})", file=f)