rrxiv

scripts/merge-sidecar-edges.py · other · 4648 bytes · Raw

#!/usr/bin/env python3
"""Merge edges from build/main.rrxiv.aux into build/main.cir.json.

The rrxiv-python parser only extracts claim-to-claim edges where the
\\dependson{}{} arguments are already in the canonical paper:label
form. This paper uses short-form labels (I.1, I.47, etc.) because
the proof DAG is dense and short labels keep the source readable.

This post-processor reads the sidecar, filters to claim-to-claim
edges only (drops edges with a post:*, def:*, or cn:* endpoint), and
prefixes them with the canonical paper id.

It also rewrites the paper id and every claim id in the CIR to the
canonical paper UUID, and overlays authors, license, topics, based_on
and version from rrxiv-meta.json when that file exists.

Usage:
  scripts/merge-sidecar-edges.py
"""
16
17from __future__ import annotations
18
19import json
20import re
21from pathlib import Path
22
# Canonical UUID of this paper; the merged CIR is re-keyed to this id.
PAPER_ID = "01923f8e-0009-7c4d-9e1f-3a2b1c0d4e5f"
# Repository root: two levels up from this script's resolved location.
ROOT = Path(__file__).resolve().parent.parent
# Parser output that is patched in place.
CIR_PATH = ROOT / "build" / "main.cir.json"
# Sidecar holding the RRXIV:edge:* lines.
AUX_PATH = ROOT / "build" / "main.rrxiv.aux"
# Optional structured metadata overlaid onto the CIR when present.
META_PATH = ROOT / "rrxiv-meta.json"
28
# Claim labels in book*.tex are uppercase Roman.Arabic — I.1, II.12, etc.
# (Not post:N, cn:N, def:I.N — those are postulates/common notions/defs.)
CLAIM_LABEL_RE = re.compile(r"^[IVXLC]+\.\d+(\.\d+)?$")
EDGE_RE = re.compile(r"^RRXIV:edge:(depends_on|supports|contradicts|extends):([^|]+)\|(.+)$")

# Edge lists every claim carries after normalisation.
_EDGE_KINDS = ("depends_on", "supports", "contradicts", "extends")
# Separator between the paper prefix and the short claim label in an id.
_PROP_SEP = ":prop:"


def _prop_label(ref):
    """Return the text after the last ``:prop:`` in *ref*, or ``None`` if absent."""
    if not isinstance(ref, str):
        return None
    _, sep, label = ref.rpartition(_PROP_SEP)
    return label if sep else None


def _canonical_ref(ref, paper_id: str):
    """Re-prefix a ``<prefix>:prop:<label>`` reference with *paper_id*.

    References without ``:prop:`` are returned unchanged.
    """
    label = _prop_label(ref)
    return ref if label is None else f"{paper_id}{_PROP_SEP}{label}"


def _overlay_meta(cir: dict, meta: dict) -> None:
    """Copy the authoritative fields of rrxiv-meta.json onto *cir* in place."""
    # The parser captures the LaTeX \author{} argument verbatim; the
    # structured list in the meta file is the source of truth instead.
    authors = meta.get("authors")
    if isinstance(authors, list) and authors:
        cir["authors"] = authors
    for key in ("license", "topics", "based_on"):
        if meta.get(key) is not None:
            cir[key] = meta[key]
    # `version` from meta wins (e.g. "v2" after a revision); otherwise the
    # value the parser set is kept.
    if meta.get("version"):
        cir["version"] = meta["version"]


def _canonicalise_claims(claims: list, paper_id: str) -> dict:
    """Re-key every claim to *paper_id* and index the claims by short label.

    Each claim gets ``paper_id`` set, its ``id`` and existing edge targets
    rewritten to ``<paper_id>:prop:<label>``, and all four edge lists
    present. Returns ``{label: claim}`` for claims whose id contains
    ``:prop:``; on duplicate labels the last claim wins.
    """
    by_label: dict[str, dict] = {}
    for claim in claims:
        claim["paper_id"] = paper_id
        label = _prop_label(claim["id"])
        if label is not None:
            claim["id"] = f"{paper_id}{_PROP_SEP}{label}"
            by_label[label] = claim
        for kind in _EDGE_KINDS:
            # `or []` also covers an edge list that is explicitly null.
            claim[kind] = [_canonical_ref(t, paper_id) for t in claim.get(kind) or []]
    return by_label


def _merge_edges(aux_lines: list, by_label: dict, paper_id: str) -> tuple:
    """Append claim-to-claim sidecar edges to the claims in *by_label*.

    Returns ``(merged, skipped)``. ``skipped`` counts edge lines with a
    non-claim endpoint as well as edges whose source claim is unknown.
    Lines that are not ``RRXIV:edge:`` records are ignored and not counted.
    """
    merged = skipped = 0
    for line in aux_lines:
        found = EDGE_RE.match(line)
        if not found:
            continue
        kind = found.group(1)
        src = found.group(2).strip()
        tgt = found.group(3).strip()
        # Only claim -> claim edges.
        if not (CLAIM_LABEL_RE.match(src) and CLAIM_LABEL_RE.match(tgt)):
            skipped += 1
            continue
        claim = by_label.get(src)
        if claim is None:
            skipped += 1
            continue
        full_target = f"{paper_id}{_PROP_SEP}{tgt}"
        if full_target not in claim[kind]:
            claim[kind].append(full_target)
            merged += 1
    return merged, skipped


def main(
    cir_path: "Path | None" = None,
    aux_path: "Path | None" = None,
    meta_path: "Path | None" = None,
    paper_id: "str | None" = None,
) -> int:
    """Patch the CIR JSON in place with canonical ids and sidecar edges.

    Args:
        cir_path: CIR JSON to rewrite; defaults to ``CIR_PATH``.
        aux_path: sidecar with ``RRXIV:edge:`` lines; defaults to ``AUX_PATH``.
        meta_path: optional metadata JSON; defaults to ``META_PATH``.
        paper_id: canonical paper id; defaults to ``PAPER_ID``.

    Returns:
        0 on success.

    Raises:
        SystemExit: if the CIR file or the sidecar file does not exist.
    """
    cir_path = CIR_PATH if cir_path is None else Path(cir_path)
    aux_path = AUX_PATH if aux_path is None else Path(aux_path)
    meta_path = META_PATH if meta_path is None else Path(meta_path)
    paper_id = PAPER_ID if paper_id is None else paper_id

    if not cir_path.is_file():
        raise SystemExit(f"missing {cir_path}")
    if not aux_path.is_file():
        raise SystemExit(f"missing {aux_path}")

    # JSON is read as UTF-8 explicitly rather than with the locale codec.
    cir = json.loads(cir_path.read_text(encoding="utf-8"))

    # The parser sets paper_id / claim.id prefixes to the rrxiv-meta slug,
    # which is fine for build artefacts, but the deployed instance keys
    # everything off the canonical UUID.
    cir["id"] = paper_id
    cir.setdefault("id_slug", "rrxiv:2605.00009")

    if meta_path.is_file():
        _overlay_meta(cir, json.loads(meta_path.read_text(encoding="utf-8")))

    by_label = _canonicalise_claims(cir.get("claims", []), paper_id)

    # Edge records are plain ASCII, so undecodable bytes elsewhere in the
    # sidecar are replaced rather than aborting the merge.
    aux_text = aux_path.read_text(encoding="utf-8", errors="replace")
    merged, skipped = _merge_edges(aux_text.splitlines(), by_label, paper_id)

    cir_path.write_text(json.dumps(cir, indent=2) + "\n", encoding="utf-8")
    print(f"merged {merged} claim-to-claim edges; skipped {skipped} non-claim edges")
    return 0
116
117
# Run the merge when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
120

scripts/merge-sidecar-edges.py · other · 4648 bytes · Raw

#!/usr/bin/env python3
"""Merge edges from build/main.rrxiv.aux into build/main.cir.json.

The rrxiv-python parser only extracts claim-to-claim edges where the
\\dependson{}{} arguments are already in the canonical paper:label
form. This paper uses short-form labels (I.1, I.47, etc.) because
the proof DAG is dense and short labels keep the source readable.

This post-processor reads the sidecar, filters to claim-to-claim
edges only (drops edges with a post:*, def:*, or cn:* endpoint), and
prefixes them with the canonical paper id.

It also rewrites the paper id and every claim id in the CIR to the
canonical paper UUID, and overlays authors, license, topics, based_on
and version from rrxiv-meta.json when that file exists.

Usage:
  scripts/merge-sidecar-edges.py
"""
16
17from __future__ import annotations
18
19import json
20import re
21from pathlib import Path
22
# Canonical UUID of this paper; the merged CIR is re-keyed to this id.
PAPER_ID = "01923f8e-0009-7c4d-9e1f-3a2b1c0d4e5f"
# Repository root: two levels up from this script's resolved location.
ROOT = Path(__file__).resolve().parent.parent
# Parser output that is patched in place.
CIR_PATH = ROOT / "build" / "main.cir.json"
# Sidecar holding the RRXIV:edge:* lines.
AUX_PATH = ROOT / "build" / "main.rrxiv.aux"
# Optional structured metadata overlaid onto the CIR when present.
META_PATH = ROOT / "rrxiv-meta.json"
28
# Claim labels in book*.tex are uppercase Roman.Arabic — I.1, II.12, etc.
# (Not post:N, cn:N, def:I.N — those are postulates/common notions/defs.)
CLAIM_LABEL_RE = re.compile(r"^[IVXLC]+\.\d+(\.\d+)?$")
EDGE_RE = re.compile(r"^RRXIV:edge:(depends_on|supports|contradicts|extends):([^|]+)\|(.+)$")

# Edge lists every claim carries after normalisation.
_EDGE_KINDS = ("depends_on", "supports", "contradicts", "extends")
# Separator between the paper prefix and the short claim label in an id.
_PROP_SEP = ":prop:"


def _prop_label(ref):
    """Return the text after the last ``:prop:`` in *ref*, or ``None`` if absent."""
    if not isinstance(ref, str):
        return None
    _, sep, label = ref.rpartition(_PROP_SEP)
    return label if sep else None


def _canonical_ref(ref, paper_id: str):
    """Re-prefix a ``<prefix>:prop:<label>`` reference with *paper_id*.

    References without ``:prop:`` are returned unchanged.
    """
    label = _prop_label(ref)
    return ref if label is None else f"{paper_id}{_PROP_SEP}{label}"


def _overlay_meta(cir: dict, meta: dict) -> None:
    """Copy the authoritative fields of rrxiv-meta.json onto *cir* in place."""
    # The parser captures the LaTeX \author{} argument verbatim; the
    # structured list in the meta file is the source of truth instead.
    authors = meta.get("authors")
    if isinstance(authors, list) and authors:
        cir["authors"] = authors
    for key in ("license", "topics", "based_on"):
        if meta.get(key) is not None:
            cir[key] = meta[key]
    # `version` from meta wins (e.g. "v2" after a revision); otherwise the
    # value the parser set is kept.
    if meta.get("version"):
        cir["version"] = meta["version"]


def _canonicalise_claims(claims: list, paper_id: str) -> dict:
    """Re-key every claim to *paper_id* and index the claims by short label.

    Each claim gets ``paper_id`` set, its ``id`` and existing edge targets
    rewritten to ``<paper_id>:prop:<label>``, and all four edge lists
    present. Returns ``{label: claim}`` for claims whose id contains
    ``:prop:``; on duplicate labels the last claim wins.
    """
    by_label: dict[str, dict] = {}
    for claim in claims:
        claim["paper_id"] = paper_id
        label = _prop_label(claim["id"])
        if label is not None:
            claim["id"] = f"{paper_id}{_PROP_SEP}{label}"
            by_label[label] = claim
        for kind in _EDGE_KINDS:
            # `or []` also covers an edge list that is explicitly null.
            claim[kind] = [_canonical_ref(t, paper_id) for t in claim.get(kind) or []]
    return by_label


def _merge_edges(aux_lines: list, by_label: dict, paper_id: str) -> tuple:
    """Append claim-to-claim sidecar edges to the claims in *by_label*.

    Returns ``(merged, skipped)``. ``skipped`` counts edge lines with a
    non-claim endpoint as well as edges whose source claim is unknown.
    Lines that are not ``RRXIV:edge:`` records are ignored and not counted.
    """
    merged = skipped = 0
    for line in aux_lines:
        found = EDGE_RE.match(line)
        if not found:
            continue
        kind = found.group(1)
        src = found.group(2).strip()
        tgt = found.group(3).strip()
        # Only claim -> claim edges.
        if not (CLAIM_LABEL_RE.match(src) and CLAIM_LABEL_RE.match(tgt)):
            skipped += 1
            continue
        claim = by_label.get(src)
        if claim is None:
            skipped += 1
            continue
        full_target = f"{paper_id}{_PROP_SEP}{tgt}"
        if full_target not in claim[kind]:
            claim[kind].append(full_target)
            merged += 1
    return merged, skipped


def main(
    cir_path: "Path | None" = None,
    aux_path: "Path | None" = None,
    meta_path: "Path | None" = None,
    paper_id: "str | None" = None,
) -> int:
    """Patch the CIR JSON in place with canonical ids and sidecar edges.

    Args:
        cir_path: CIR JSON to rewrite; defaults to ``CIR_PATH``.
        aux_path: sidecar with ``RRXIV:edge:`` lines; defaults to ``AUX_PATH``.
        meta_path: optional metadata JSON; defaults to ``META_PATH``.
        paper_id: canonical paper id; defaults to ``PAPER_ID``.

    Returns:
        0 on success.

    Raises:
        SystemExit: if the CIR file or the sidecar file does not exist.
    """
    cir_path = CIR_PATH if cir_path is None else Path(cir_path)
    aux_path = AUX_PATH if aux_path is None else Path(aux_path)
    meta_path = META_PATH if meta_path is None else Path(meta_path)
    paper_id = PAPER_ID if paper_id is None else paper_id

    if not cir_path.is_file():
        raise SystemExit(f"missing {cir_path}")
    if not aux_path.is_file():
        raise SystemExit(f"missing {aux_path}")

    # JSON is read as UTF-8 explicitly rather than with the locale codec.
    cir = json.loads(cir_path.read_text(encoding="utf-8"))

    # The parser sets paper_id / claim.id prefixes to the rrxiv-meta slug,
    # which is fine for build artefacts, but the deployed instance keys
    # everything off the canonical UUID.
    cir["id"] = paper_id
    cir.setdefault("id_slug", "rrxiv:2605.00009")

    if meta_path.is_file():
        _overlay_meta(cir, json.loads(meta_path.read_text(encoding="utf-8")))

    by_label = _canonicalise_claims(cir.get("claims", []), paper_id)

    # Edge records are plain ASCII, so undecodable bytes elsewhere in the
    # sidecar are replaced rather than aborting the merge.
    aux_text = aux_path.read_text(encoding="utf-8", errors="replace")
    merged, skipped = _merge_edges(aux_text.splitlines(), by_label, paper_id)

    cir_path.write_text(json.dumps(cir, indent=2) + "\n", encoding="utf-8")
    print(f"merged {merged} claim-to-claim edges; skipped {skipped} non-claim edges")
    return 0
116
117
# Run the merge when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
120