// ---
// tags: optica, rust
// crystal-type: source
// crystal-domain: comp
// ---
use crate::parser::{PageId, ParsedPage};
use crate::scanner::{DiscoveredFile, DiscoveredFiles, FileKind};
use anyhow::Result;
use globset::{Glob, GlobSetBuilder};
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::time::SystemTime;
use walkdir::WalkDir;
/// Per-subgraph parse cache: source file path -> (mtime, parsed page).
/// Re-use cached entries when mtime matches to avoid re-parsing unchanged files.
pub type SubgraphParseCache = HashMap<PathBuf, (SystemTime, ParsedPage)>;
/// Declaration of an external repository to include as a subgraph.
///
/// One value describes one mounted repository: where it lives on disk, under
/// which URL prefix its pages appear, and which root page declares it.
#[derive(Debug, Clone)]
pub struct SubgraphDecl {
/// Subgraph name. Stamped onto every discovered file as its `subgraph`
/// attribution and reported back in the scan stats.
pub name: String,
/// URL mount prefix for all pages in this subgraph.
/// Empty string = root-mounted (no prefix). Defaults to `name`.
pub mount: String,
/// Root directory of the external repository that gets walked.
pub repo_path: PathBuf,
/// Glob patterns matched against repo-relative paths; matching files are skipped.
pub exclude_patterns: Vec<String>,
/// Id of the root-graph page that declares this subgraph. During ingestion
/// the subgraph page with this id (or its slug) absorbs that page's metadata.
pub declaring_page_id: PageId,
/// Not read anywhere in this file.
/// NOTE(review): presumably marks the subgraph for private filtering downstream - confirm.
pub is_private: bool,
/// When true, optica adds the "nav" tag to this subgraph's root page.
pub menu: bool,
/// Position in the subgraph config - used as menu_order so nav items appear in declaration order.
pub menu_order: Option<i32>,
}
/// Default exclude patterns applied to all subgraphs.
pub const DEFAULT_EXCLUDES: &[&str] = &[
".git/**",
"target/**",
"**/target/**",
"node_modules/**",
"**/node_modules/**",
"build/**",
"**/build/**",
"dist/**",
"**/dist/**",
".next/**",
"**/.next/**",
"out/**",
"**/out/**",
"**/.DS_Store",
"Cargo.lock",
"**/Cargo.lock",
// Web build output. Optica generates its own index.html for every page,
// so subgraphs containing static-site output conflict with rendering.
// Knowledge graphs use markdown; HTML in a subgraph is almost always
// a compiled artifact, not authored content.
"**/index.html",
"**/index.htm",
// Built JS bundles.
"**/*.min.js",
"**/*.min.css",
// Lock files for JS toolchains.
"**/package-lock.json",
"**/pnpm-lock.yaml",
"**/yarn.lock",
// Vendored dependencies. Not authored content; each file would render
// to a full HTML page and balloon output size by an order of magnitude.
"vendor/**",
"**/vendor/**",
// Python runtime caches and envs.
"**/__pycache__/**",
"**/*.pyc",
"**/*.pyo",
"**/.venv/**",
"**/venv/**",
// Compiled binaries and object files.
"**/*.wasm",
"**/*.so",
"**/*.dylib",
"**/*.a",
"**/*.rlib",
"**/*.o",
"**/*.exe",
"**/*.dll",
// Coverage and benchmark output.
"**/coverage/**",
"**/.nyc_output/**",
];
/// Read subgraph declarations from an optional TOML config file.
///
/// When `subgraphs_path` is `None` the result is an empty list: optica then
/// operates on a single repo with no embedded subgraphs.
///
/// Keeping declaration loading behind this one function guards against the
/// regression where a new code path forgets workspace mode and silently drops
/// every subgraph.
///
/// # Errors
///
/// Returns whatever error `crate::scanner::subgraph_config::load` reports for
/// the given path.
pub fn load_subgraph_decls(subgraphs_path: Option<&Path>) -> Result<Vec<SubgraphDecl>> {
    if let Some(config_path) = subgraphs_path {
        return crate::scanner::subgraph_config::load(config_path);
    }
    Ok(Vec::new())
}
/// Stats reported by `ingest_subgraph` for one decl. Callers print or aggregate.
///
/// Derives `Debug`, `Clone`, `PartialEq` and `Eq` so callers can log, copy and
/// compare stats values directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SubgraphScanStats {
    /// Name of the subgraph these numbers belong to.
    pub name: String,
    /// Number of markdown files discovered (`FileKind::Page`).
    pub page_count: usize,
    /// Number of non-markdown files discovered (`FileKind::File`).
    pub file_count: usize,
}
/// Result of ingesting one subgraph: all new pages it contributes (md + non-md)
/// plus a stats summary. The declaring page (if present in `root_pages`) is
/// removed in place so the README produced by this ingestion takes its slot.
pub struct SubgraphIngestion {
/// Parsed markdown pages first, followed by the pages produced for non-markdown files.
pub pages: Vec<ParsedPage>,
/// Discovery counts for this subgraph (markdown pages and other files).
pub stats: SubgraphScanStats,
}
/// Scan, parse, and merge one subgraph into the page set.
///
/// This is the uncached ingestion path; every markdown file is parsed afresh.
///
/// Steps, in order:
///
/// 1. Discover the subgraph's files via `scan_subgraph` and count them by kind.
/// 2. Parse every markdown file. The page whose id equals the decl's
///    `declaring_page_id` (or its slug) takes over the root declaring page's
///    tags, aliases, properties, visibility, icon and stake, has the two
///    bodies merged through `merge_subgraph_content`, and gains any outgoing
///    links it did not already have. When `decl.menu` is set it also receives
///    the "nav" tag and, if it has none, the decl's `menu_order`.
/// 3. Drop the declaring page from `root_pages` (only the copy that does not
///    itself belong to a subgraph).
/// 4. Run the non-markdown files through `parse_all` and append the results.
///
/// # Errors
///
/// Fails if scanning or parsing fails. An error in step 2 leaves `root_pages`
/// untouched; an error in step 4 happens after the eviction in step 3.
pub fn ingest_subgraph(
    decl: &SubgraphDecl,
    root_pages: &mut Vec<ParsedPage>,
) -> Result<SubgraphIngestion> {
    let mut discovered = scan_subgraph(decl)?;

    // Tally markdown pages and plain files in a single pass.
    let (page_count, file_count) =
        discovered
            .iter()
            .fold((0usize, 0usize), |(md, other), f| {
                if f.kind == FileKind::Page {
                    (md + 1, other)
                } else if f.kind == FileKind::File {
                    (md, other + 1)
                } else {
                    (md, other)
                }
            });

    // Snapshot the declaring page now: it is evicted from `root_pages` below,
    // but its metadata is still needed while merging.
    let root_decl_page = root_pages
        .iter()
        .find(|p| p.id == decl.declaring_page_id)
        .cloned();
    let slugged_id = crate::parser::slugify_page_name(&decl.declaring_page_id);

    let mut pages = Vec::with_capacity(page_count + file_count);

    // Markdown pages; the subgraph's root page absorbs the declaring page.
    for md_file in discovered.iter().filter(|f| f.kind == FileKind::Page) {
        let mut page = crate::parser::parse_file(md_file)?;
        let is_subgraph_root = page.id == decl.declaring_page_id || page.id == slugged_id;

        if is_subgraph_root {
            if let Some(root) = &root_decl_page {
                let meta = &mut page.meta;
                meta.tags = root.meta.tags.clone();
                meta.aliases = root.meta.aliases.clone();
                meta.properties = root.meta.properties.clone();
                meta.public = root.meta.public;
                meta.icon = root.meta.icon.clone();
                meta.stake = root.meta.stake;

                // Only merge bodies when the declaring page actually has text.
                if !root.content_md.trim().is_empty() {
                    let readme_body = std::mem::take(&mut page.content_md);
                    page.content_md = crate::parser::merge_subgraph_content(
                        &root.content_md,
                        &decl.name,
                        &readme_body,
                    );
                }

                // Checked against the growing list, so repeats are added once.
                for link in &root.outgoing_links {
                    let already_linked = page.outgoing_links.contains(link);
                    if !already_linked {
                        page.outgoing_links.push(link.clone());
                    }
                }
            }

            if decl.menu {
                let has_nav = page.meta.tags.iter().any(|tag| tag == "nav");
                if !has_nav {
                    page.meta.tags.push(String::from("nav"));
                }
                if page.meta.menu_order.is_none() {
                    page.meta.menu_order = decl.menu_order;
                }
            }
        }

        pages.push(page);
    }

    // The subgraph README owns the declaring page's slot from here on.
    if root_decl_page.is_some() {
        root_pages.retain(|p| p.id != decl.declaring_page_id || p.subgraph.is_some());
    }

    // Non-markdown files become code-preview pages.
    discovered.retain(|f| f.kind == FileKind::File);
    let non_markdown = DiscoveredFiles {
        pages: Vec::new(),
        journals: Vec::new(),
        media: Vec::new(),
        files: discovered,
    };
    pages.extend(crate::parser::parse_all(&non_markdown)?);

    let stats = SubgraphScanStats {
        name: decl.name.clone(),
        page_count,
        file_count,
    };
    Ok(SubgraphIngestion { pages, stats })
}
/// Cached variant of `ingest_subgraph`: re-parses only files whose mtime
/// changed since the last ingest. For large subgraphs (cyb: 5k files) this
/// reduces re-ingest cost from ~30s to <1s on single-file edits.
///
/// `parse_cache` stores the raw parse result per markdown file path together
/// with the mtime it was parsed at. The declaring-page merge and the menu
/// tagging are applied to a clone on every call, so cached entries never
/// contain merged state. Entries for paths that are no longer discovered are
/// pruned before returning. Non-markdown files are not cached; they go
/// through `parse_all` each time.
///
/// The merge and menu behaviour matches `ingest_subgraph`: the page whose id
/// equals `declaring_page_id` (or its slug) absorbs the declaring page's
/// metadata, body and outgoing links. When `decl.menu` is set it also gets the
/// "nav" tag and, if it has none, the decl's `menu_order`.
///
/// # Errors
///
/// Fails if scanning or parsing fails.
pub fn ingest_subgraph_cached(
    decl: &SubgraphDecl,
    root_pages: &mut Vec<ParsedPage>,
    parse_cache: &mut SubgraphParseCache,
) -> Result<SubgraphIngestion> {
    let subgraph_files = scan_subgraph(decl)?;
    let page_count = subgraph_files
        .iter()
        .filter(|f| f.kind == FileKind::Page)
        .count();
    let file_count = subgraph_files
        .iter()
        .filter(|f| f.kind == FileKind::File)
        .count();

    // Cloned because the declaring page is evicted from root_pages below.
    let declaring_page = root_pages
        .iter()
        .find(|p| p.id == decl.declaring_page_id)
        .cloned();
    let decl_slug = crate::parser::slugify_page_name(&decl.declaring_page_id);

    let mut pages = Vec::with_capacity(page_count + file_count);
    for file in &subgraph_files {
        if file.kind != FileKind::Page {
            continue;
        }

        // An unreadable mtime falls back to the epoch.
        let mtime = std::fs::metadata(&file.path)
            .ok()
            .and_then(|m| m.modified().ok())
            .unwrap_or(SystemTime::UNIX_EPOCH);

        // Cache hit only when the stored mtime matches; otherwise parse and
        // refresh the entry.
        let cached = parse_cache
            .get(&file.path)
            .filter(|entry| entry.0 == mtime)
            .map(|entry| entry.1.clone());
        let mut page = match cached {
            Some(page) => page,
            None => {
                let parsed = crate::parser::parse_file(file)?;
                parse_cache.insert(file.path.clone(), (mtime, parsed.clone()));
                parsed
            }
        };

        if page.id == decl.declaring_page_id || page.id == decl_slug {
            if let Some(ref dp) = declaring_page {
                page.meta.tags = dp.meta.tags.clone();
                page.meta.aliases = dp.meta.aliases.clone();
                page.meta.properties = dp.meta.properties.clone();
                page.meta.public = dp.meta.public;
                page.meta.icon = dp.meta.icon.clone();
                page.meta.stake = dp.meta.stake;
                if !dp.content_md.trim().is_empty() {
                    let readme_content = std::mem::take(&mut page.content_md);
                    page.content_md = crate::parser::merge_subgraph_content(
                        &dp.content_md,
                        &decl.name,
                        &readme_content,
                    );
                }
                for link in &dp.outgoing_links {
                    if !page.outgoing_links.contains(link) {
                        page.outgoing_links.push(link.clone());
                    }
                }
            }
            // Same menu handling as `ingest_subgraph`; without it a cached
            // re-ingest would drop the subgraph's nav entry.
            if decl.menu {
                if !page.meta.tags.iter().any(|tag| tag == "nav") {
                    page.meta.tags.push("nav".to_string());
                }
                if page.meta.menu_order.is_none() {
                    page.meta.menu_order = decl.menu_order;
                }
            }
        }
        pages.push(page);
    }

    // The subgraph README owns the declaring page's slot now.
    if declaring_page.is_some() {
        root_pages.retain(|p| !(p.id == decl.declaring_page_id && p.subgraph.is_none()));
    }

    // Prune stale entries for deleted files
    let current_paths: std::collections::HashSet<&PathBuf> =
        subgraph_files.iter().map(|f| &f.path).collect();
    parse_cache.retain(|p, _| current_paths.contains(p));

    // Non-markdown files become code-preview pages.
    let sg_files: Vec<DiscoveredFile> = subgraph_files
        .into_iter()
        .filter(|f| f.kind == FileKind::File)
        .collect();
    let sg_discovered = DiscoveredFiles {
        pages: Vec::new(),
        journals: Vec::new(),
        media: Vec::new(),
        files: sg_files,
    };
    pages.extend(crate::parser::parse_all(&sg_discovered)?);

    Ok(SubgraphIngestion {
        pages,
        stats: SubgraphScanStats {
            name: decl.name.clone(),
            page_count,
            file_count,
        },
    })
}
/// Pick the directory inside a subgraph repo that holds its pages.
///
/// Candidates are tried in order - `root`, then `graph`, then `pages` - and
/// the first one that exists under `repo_path` wins. When none exists the
/// repo root itself is returned, meaning pages live at the top level.
fn resolve_subgraph_graph_dir(repo_path: &Path) -> PathBuf {
    ["root", "graph", "pages"]
        .iter()
        .map(|candidate| repo_path.join(candidate))
        .find(|dir| dir.exists())
        .unwrap_or_else(|| repo_path.to_path_buf())
}
/// Scan an external repository and return discovered files under the subgraph namespace.
/// All files are collected; markdown files become Pages, everything else becomes Files.
///
/// Behaviour:
///
/// - A missing `repo_path` is not an error: a warning is printed to stderr and
///   an empty list is returned.
/// - Directories named `.git`, `target`, `node_modules` or `build` below the
///   repo root are never descended into. The repo root itself is exempt, so a
///   repository whose own directory carries one of those names is still scanned.
/// - Each file's repo-relative path is matched against `decl.exclude_patterns`;
///   matches are skipped. A pattern that fails to compile is reported on
///   stderr and ignored rather than aborting the scan.
/// - `.md` / `.markdown` files become `FileKind::Page`. Those inside the
///   resolved graph directory are named relative to it, others relative to the
///   repo root. Every other file becomes `FileKind::File`, named relative to
///   the repo root with its extension kept.
/// - Directory entries that cannot be read are skipped.
///
/// # Errors
///
/// Fails only if the exclude glob set cannot be built.
pub fn scan_subgraph(decl: &SubgraphDecl) -> Result<Vec<DiscoveredFile>> {
    if !decl.repo_path.exists() {
        eprintln!(
            "Warning: subgraph '{}' repo path does not exist: {} β skipping",
            decl.name,
            decl.repo_path.display()
        );
        return Ok(vec![]);
    }
    let graph_dir = resolve_subgraph_graph_dir(&decl.repo_path);

    // Build exclude glob set. Bad patterns are surfaced instead of vanishing,
    // so a typo in the config does not silently stop excluding files.
    let mut builder = GlobSetBuilder::new();
    for pattern in &decl.exclude_patterns {
        match Glob::new(pattern) {
            Ok(glob) => {
                builder.add(glob);
            }
            Err(err) => eprintln!(
                "Warning: subgraph '{}' has invalid exclude pattern '{}': {}; ignoring it",
                decl.name, pattern, err
            ),
        }
    }
    let exclude_set = builder.build()?;

    // Directories to skip entirely - prevents WalkDir from descending into
    // .git/objects, target/, node_modules/ etc. which can contain thousands of files.
    let skip_dirs: std::collections::HashSet<&str> =
        [".git", "target", "node_modules", "build"].into();

    let mut files = Vec::new();
    for entry in WalkDir::new(&decl.repo_path)
        .into_iter()
        .filter_entry(|e| {
            // The predicate also runs on the walk root (depth 0). Never prune
            // it, or a repo checked out as e.g. `build/` would yield nothing.
            if e.depth() > 0 && e.file_type().is_dir() {
                let name = e.file_name().to_string_lossy();
                !skip_dirs.contains(name.as_ref())
            } else {
                true
            }
        })
        .filter_map(|e| e.ok())
        .filter(|e| e.file_type().is_file())
    {
        let path = entry.path().to_path_buf();

        // Get path relative to repo root for exclusion matching
        let relative = path
            .strip_prefix(&decl.repo_path)
            .unwrap_or(&path)
            .to_string_lossy();
        if exclude_set.is_match(relative.as_ref()) {
            continue;
        }

        let is_md = path
            .extension()
            .map(|ext| ext == "md" || ext == "markdown")
            .unwrap_or(false);
        if is_md {
            // Pages inside graph_dir get names relative to graph_dir
            // (strips the root/graph/pages prefix), others relative to repo root
            let base = if path.starts_with(&graph_dir) {
                &graph_dir
            } else {
                &decl.repo_path
            };
            let name = subgraph_page_name(&path, base, &decl.mount);
            files.push(DiscoveredFile {
                path,
                kind: FileKind::Page,
                name,
                subgraph: Some(decl.name.clone()),
            });
        } else {
            let name = subgraph_file_name(&path, &decl.repo_path, &decl.mount);
            files.push(DiscoveredFile {
                path,
                kind: FileKind::File,
                name,
                subgraph: Some(decl.name.clone()),
            });
        }
    }
    Ok(files)
}
/// Compute the page name for a markdown file inside a subgraph.
///
/// The file's path relative to `repo_root`, minus its extension, is prefixed
/// with `mount` (an empty mount means root-mounted, no prefix). A file whose
/// stem is `README` in any letter case stands in for its directory: the
/// top-level README maps to the mount itself, a nested one to its parent
/// directory's name.
///
/// e.g., mount="trident", ~/git/trident/README.md -> "trident"
/// e.g., mount="trident", ~/git/trident/docs/README.md -> "trident/docs"
/// e.g., mount="", ~/git/crystal/README.md -> "" (root page)
/// e.g., mount="", ~/git/crystal/cyber/README.md -> "cyber"
/// e.g., mount="w/zoya", ~/git/zoya/README.md -> "w/zoya"
fn subgraph_page_name(path: &Path, repo_root: &Path, mount: &str) -> String {
    let stem_path = path
        .strip_prefix(repo_root)
        .unwrap_or(path)
        .with_extension("");
    let stem = stem_path.to_string_lossy();

    // Split off the last path component and decide based on whether it is a README.
    match stem.rsplit_once('/') {
        // Top-level README: the page is the mount point itself.
        None if stem.eq_ignore_ascii_case("README") => mount.to_string(),
        // Nested README: the page is the containing directory.
        Some((dir, leaf)) if leaf.eq_ignore_ascii_case("README") => join_mount(mount, dir),
        // Any other file keeps its full relative stem.
        _ => join_mount(mount, &stem),
    }
}
/// Compute the name for a non-markdown file inside a subgraph.
///
/// The name is the file's path relative to `repo_root`, extension included,
/// prefixed with `mount`. An empty `mount` means root-mounted (no prefix).
fn subgraph_file_name(path: &Path, repo_root: &Path, mount: &str) -> String {
    let within_repo = path.strip_prefix(repo_root).unwrap_or(path);
    join_mount(mount, &within_repo.to_string_lossy())
}
/// Prefix `rel` with `mount`, separated by a single `/`.
///
/// An empty `mount` means no prefix: `rel` is returned unchanged.
fn join_mount(mount: &str, rel: &str) -> String {
    match mount {
        "" => rel.to_string(),
        prefix => format!("{}/{}", prefix, rel),
    }
}
/// Enforce namespace monopoly: remove root pages whose namespace conflicts
/// with a claimed subgraph namespace.
/// Returns list of (evicted_page_id, reason) for reporting.
pub fn enforce_namespace_monopoly(
root_pages: &mut Vec<ParsedPage>,
subgraph_namespaces: &[String],
) -> Vec<(PageId, String)> {
let mut evicted = Vec::new();
root_pages.retain(|page| {
if let Some(ref ns) = page.namespace {
for sg_ns in subgraph_namespaces {
if ns == sg_ns || ns.starts_with(&format!("{}/", sg_ns)) {
evicted.push((
page.id.clone(),
format!(
"namespace '{}' claimed by subgraph '{}'",
ns, sg_ns
),
));
return false;
}
}
}
true
});
evicted
}
#[cfg(test)]
mod tests {
use super::*;
use std::path::PathBuf;
#[test]
fn test_subgraph_page_name() {
    let repo = PathBuf::from("/git/trident");

    // (markdown file, expected page name) with mount = "trident"
    let cases = [
        // Repo-root README maps to just the subgraph name
        ("/git/trident/README.md", "trident"),
        // Nested files keep full path
        (
            "/git/trident/docs/explanation/vision.md",
            "trident/docs/explanation/vision",
        ),
        // Directory README becomes the directory page
        ("/git/trident/src/README.md", "trident/src"),
        ("/git/trident/docs/README.md", "trident/docs"),
    ];

    for &(source, expected) in cases.iter() {
        let derived = subgraph_page_name(&PathBuf::from(source), &repo, "trident");
        assert_eq!(derived, expected);
    }
}
#[test]
fn test_subgraph_file_name() {
    let repo = PathBuf::from("/git/trident");

    // Non-markdown files keep their extension and repo-relative path.
    let cases = [
        ("/git/trident/src/main.rs", "trident/src/main.rs"),
        ("/git/trident/Cargo.toml", "trident/Cargo.toml"),
    ];

    for &(source, expected) in cases.iter() {
        let derived = subgraph_file_name(&PathBuf::from(source), &repo, "trident");
        assert_eq!(derived, expected);
    }
}
/// Regression: workspace mode (TOML --subgraphs) must produce subgraph
/// pages everywhere - build, check, and incremental reload. The historical
/// failure was a second code path silently calling the frontmatter
/// discovery, returning zero decls, and dropping all subgraph content.
///
/// This test exercises the public surface end-to-end: a TOML config on
/// disk, load_subgraph_decls, then ingest_subgraph. If either piece
/// regresses, the assertions fail with a clear signal.
#[test]
fn test_load_and_ingest_produces_pages_for_toml_subgraph() {
    use std::fs;
    use tempfile::TempDir;

    // Fixture: one repo with a top-level README, one page under `root/`,
    // and one non-markdown file.
    let workspace = TempDir::new().unwrap();
    let repo = workspace.path().join("mysub");
    fs::create_dir_all(repo.join("root")).unwrap();
    fs::write(
        repo.join("README.md"),
        "---\ntags: doc\n---\n# mysub\n\nrepo readme",
    ).unwrap();
    fs::write(
        repo.join("root").join("inner.md"),
        "---\ntags: doc\n---\n\ninner page",
    ).unwrap();
    fs::write(repo.join("Cargo.toml"), "[package]\nname=\"x\"").unwrap();

    // The declaration must sit under an array-of-tables header; a bare
    // `subgraphs` line is not valid TOML and cannot be parsed.
    // NOTE(review): header name assumed to be `subgraphs` - confirm against
    // the schema in `scanner::subgraph_config`.
    let config_path = workspace.path().join("subgraphs.toml");
    fs::write(
        &config_path,
        format!(
            "[[subgraphs]]\nname = \"mysub\"\npath = {:?}\n",
            repo
        ),
    ).unwrap();

    // load_subgraph_decls with TOML must produce a non-empty list.
    // Without it (None), it must produce an empty list - never crash.
    let decls = load_subgraph_decls(Some(&config_path)).unwrap();
    assert_eq!(decls.len(), 1, "TOML config with one subgraph should load one decl");
    assert!(load_subgraph_decls(None).unwrap().is_empty(), "no path β no decls");

    // ingest_subgraph must return non-empty pages (md + non-md), proving
    // the full pipeline runs. The historical bug bypassed this entirely.
    let mut root_pages: Vec<ParsedPage> = vec![];
    let ingestion = ingest_subgraph(&decls[0], &mut root_pages).unwrap();
    assert!(
        ingestion.stats.page_count >= 2,
        "expected at least README + inner page, got {} markdown pages",
        ingestion.stats.page_count
    );
    assert!(
        ingestion.stats.file_count >= 1,
        "expected Cargo.toml as a non-md preview page, got {} file pages",
        ingestion.stats.file_count
    );
    assert!(
        !ingestion.pages.is_empty(),
        "ingest_subgraph must produce pages; the bug was zero pages slipping through silently"
    );

    // Pages should carry the subgraph attribution so downstream filters
    // (private subgraph filtering, badge rendering) work.
    assert!(
        ingestion.pages.iter().all(|p| p.subgraph.as_deref() == Some("mysub")),
        "every ingested page must be tagged with the subgraph name"
    );
}
#[test]
fn test_namespace_monopoly_evicts_matching() {
    use crate::parser::{PageKind, PageMeta};
    use std::collections::HashMap;

    // Minimal root page (no subgraph attribution) with the given id and namespace.
    let page_with = |id: &str, ns: Option<&str>| {
        let meta = PageMeta {
            title: id.to_string(),
            properties: HashMap::new(),
            tags: vec![],
            public: Some(true),
            aliases: vec![],
            date: None,
            icon: None,
            menu_order: None,
            stake: None,
        };
        ParsedPage {
            id: id.to_string(),
            meta,
            kind: PageKind::Page,
            source_path: PathBuf::new(),
            namespace: ns.map(str::to_string),
            subgraph: None,
            content_md: String::new(),
            outgoing_links: vec![],
        }
    };

    let mut pages = vec![
        page_with("root-page", None),
        // Root level, no namespace: must NOT be evicted despite the id prefix.
        page_with("trident-thesis", None),
        // Namespace equals the claimed one: evicted.
        page_with("trident-sub-thing", Some("trident")),
        page_with("other-ns-page", Some("cyber")),
    ];

    let claimed = ["trident".to_string()];
    let evicted = enforce_namespace_monopoly(&mut pages, &claimed);

    assert_eq!(pages.len(), 3);
    assert_eq!(evicted.len(), 1);
    assert_eq!(evicted[0].0, "trident-sub-thing");
    // root-level pages with no namespace stay
    assert!(pages.iter().any(|p| p.id == "trident-thesis"));
}
}
// ---
// tags: optica, rust
// crystal-type: source
// crystal-domain: comp
// ---
use crate;
use crate;
use Result;
use ;
use HashMap;
use ;
use SystemTime;
use WalkDir;
/// Per-subgraph parse cache: path -> (mtime, parsed page).
/// Re-use cached entries when mtime matches to avoid re-parsing unchanged files.
pub type SubgraphParseCache = ;
/// Declaration of an external repository to include as a subgraph.
/// Default exclude patterns applied to all subgraphs.
pub const DEFAULT_EXCLUDES: & = &;
/// Load subgraph declarations from a TOML config file. With no path, returns
/// an empty list - optica acts on a single repo with no embedded subgraphs.
///
/// One call site for decl loading prevents the regression class where a new
/// code path forgets workspace mode and silently drops every subgraph.
/// Stats reported by `ingest_subgraph` for one decl. Callers print or aggregate.
/// Result of ingesting one subgraph: all new pages it contributes (md + non-md)
/// plus a stats summary. The declaring page (if present in `root_pages`) is
/// removed in place so the README produced by this ingestion takes its slot.
/// Scan, parse, and merge a single subgraph. Encapsulates every step that used
/// to be duplicated across build, check, watch warm-up, and incremental rebuild:
///
/// 1. WalkDir over the subgraph repo, classifying files
/// 2. Parse markdown pages; non-md files become code-fence preview pages
/// 3. If a page's id matches the decl's `declaring_page_id`, copy the root
/// declaring page's metadata + outgoing links and prepend its body
/// 4. Remove the declaring page from `root_pages` so its slot is freed
///
/// One canonical implementation - every caller routes through here.
/// Cached variant of `ingest_subgraph`: re-parses only files whose mtime
/// changed since the last ingest. For large subgraphs (cyb: 5k files) this
/// reduces re-ingest cost from ~30s to <1s on single-file edits.
/// Resolve the graph directory inside a subgraph repo, using the same
/// fallback chain as the main scanner: root -> graph -> pages -> repo root.
/// Scan an external repository and return discovered files under the subgraph namespace.
/// All files are collected; markdown files become Pages, everything else becomes Files.
/// Derive page name for a markdown file in a subgraph.
/// README.md at any level becomes the directory's page.
/// `mount` is the URL prefix for this subgraph; empty string means root-mounted.
///
/// e.g., mount="trident", ~/git/trident/README.md -> "trident"
/// e.g., mount="trident", ~/git/trident/docs/README.md -> "trident/docs"
/// e.g., mount="", ~/git/crystal/README.md -> "" (root page)
/// e.g., mount="", ~/git/crystal/cyber/README.md -> "cyber"
/// e.g., mount="w/zoya", ~/git/zoya/README.md -> "w/zoya"
/// Derive file name for a non-markdown file in a subgraph (preserves extension).
/// `mount` is the URL prefix; empty string means root-mounted.
/// Join a mount prefix with a relative path. Empty mount = no prefix.
/// Enforce namespace monopoly: remove root pages whose namespace conflicts
/// with a claimed subgraph namespace.
/// Returns list of (evicted_page_id, reason) for reporting.