pub mod admonitions;
pub mod outliner;
pub mod properties;
pub mod wikilinks;
use crate::scanner::{DiscoveredFile, DiscoveredFiles, FileKind};
use anyhow::Result;
use chrono::NaiveDate;
use serde::Serialize;
use std::collections::HashMap;
use std::path::PathBuf;
/// Identifier of a page: the slug string returned by [`slugify_page_name`].
pub type PageId = String;
/// Metadata attached to a page.
///
/// For markdown pages this value comes from `properties::extract_properties`
/// (see `parse_file`); for non-markdown files and synthesized directory
/// indexes it is filled with fixed defaults in this module.
#[derive(Debug, Clone, Serialize)]
pub struct PageMeta {
/// Display title; non-markdown files and directory indexes use the file or directory name.
pub title: String,
/// Key/value properties; left empty for pages synthesized in this module.
pub properties: HashMap<String, String>,
/// Tags; for non-markdown files these are derived from the file extension and,
/// outside subgraphs, from the top-level directory.
pub tags: Vec<String>,
/// Publication flag; pages synthesized here set `Some(true)`.
/// NOTE(review): how `None` is interpreted is decided outside this file.
pub public: Option<bool>,
/// Alternative names for the page — presumably used for link resolution; confirm against callers.
pub aliases: Vec<String>,
/// Optional date — presumably the page/journal date; confirm in `properties`.
pub date: Option<NaiveDate>,
/// Optional icon; its semantics are defined by the consumers of this struct.
pub icon: Option<String>,
/// Optional ordering hint — presumably for menus, going by the name; confirm against the renderer.
pub menu_order: Option<i32>,
/// Optional numeric stake; its meaning is not visible in this file.
pub stake: Option<u64>,
}
/// Classification of a parsed page.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub enum PageKind {
/// Regular page: any markdown file that is not a journal, and synthesized directory indexes.
Page,
/// Markdown file discovered with `FileKind::Journal`.
Journal,
/// Non-markdown file rendered as a code block or a binary-file stub (see `parse_non_md_file`).
File,
}
/// A page after parsing: identity, metadata and markdown body.
#[derive(Debug, Clone)]
pub struct ParsedPage {
/// Slug of the page name, as produced by `slugify_page_name`.
pub id: PageId,
/// Page metadata (title, tags, ...).
pub meta: PageMeta,
/// Whether this is a regular page, a journal or a non-markdown file.
pub kind: PageKind,
/// Path of the source file; empty for synthesized directory index pages.
pub source_path: PathBuf,
/// Everything before the last `/` of the page name, if the name contains one.
pub namespace: Option<String>,
/// Subgraph the page belongs to, copied from the discovered file
/// (or the owning subgraph for synthesized directory indexes).
pub subgraph: Option<String>,
/// Markdown body after the normalization steps applied in this module.
pub content_md: String,
/// Link targets collected by `wikilinks::collect_wikilinks`.
pub outgoing_links: Vec<String>,
}
/// Converts a page name into its slug ([`PageId`]).
///
/// The name is NFC-normalized and lowercased. Alphanumeric characters, `$`
/// and `.` are kept, `/` is kept as a path separator, and every other run of
/// characters collapses into a single `-`. Separators at the start of the
/// name or directly after a `/` are dropped, as are hyphens directly before
/// a `/` and at the end of the slug.
///
/// Slugs longer than 200 bytes are cut at the nearest character boundary at
/// or below 200 bytes, so names made of multi-byte characters cannot cause a
/// panic, and trailing hyphens exposed by the cut are removed.
pub fn slugify_page_name(name: &str) -> PageId {
    use unicode_normalization::UnicodeNormalization;

    /// Upper bound on the slug length, in bytes.
    const MAX_SLUG_BYTES: usize = 200;

    let lower = name.nfc().collect::<String>().to_lowercase();
    let mut slug = String::with_capacity(lower.len());
    // Starts as `true` so that leading separator characters produce no hyphen.
    let mut prev_hyphen = true;
    for ch in lower.chars() {
        if ch.is_alphanumeric() || ch == '$' || ch == '.' {
            slug.push(ch);
            prev_hyphen = false;
        } else if ch == '/' {
            // Never leave a hyphen dangling right before a path separator.
            if slug.ends_with('-') {
                slug.pop();
            }
            slug.push('/');
            prev_hyphen = true;
        } else if !prev_hyphen {
            slug.push('-');
            prev_hyphen = true;
        }
    }

    let trimmed_len = slug.trim_end_matches('-').len();
    slug.truncate(trimmed_len);

    if slug.len() > MAX_SLUG_BYTES {
        // `String::truncate` panics off a char boundary, so back up to one.
        // Index 0 is always a boundary, which guarantees termination.
        let mut cut = MAX_SLUG_BYTES;
        while !slug.is_char_boundary(cut) {
            cut -= 1;
        }
        slug.truncate(cut);
        let trimmed_len = slug.trim_end_matches('-').len();
        slug.truncate(trimmed_len);
    }
    slug
}
/// Rewrites `../media/<path>` references in every page to IPFS gateway URLs.
///
/// `map_path` must point to a JSON object mapping media paths (the part after
/// `../media/`) to CIDs. Each reference with an entry in the map becomes
/// `<gateway>/ipfs/<cid>`; trailing slashes on `gateway` are ignored.
/// References without an entry are left untouched.
///
/// Returns the number of references that were rewritten.
///
/// # Errors
///
/// Fails when the map file cannot be read or is not a valid JSON
/// string-to-string object.
pub fn apply_ipfs_rewrites(
    pages: &mut [ParsedPage],
    map_path: &std::path::Path,
    gateway: &str,
) -> anyhow::Result<usize> {
    let raw = std::fs::read_to_string(map_path)
        .map_err(|e| anyhow::anyhow!("reading ipfs map {}: {}", map_path.display(), e))?;
    let map: std::collections::HashMap<String, String> = serde_json::from_str(&raw)
        .map_err(|e| anyhow::anyhow!("parsing ipfs map {}: {}", map_path.display(), e))?;
    let gateway = gateway.trim_end_matches('/');
    // The media path ends at whitespace or at a character that closes the
    // surrounding markdown/HTML construct.
    let re = regex::Regex::new(r#"\.\./media/([^\s\)"'\]<>]+)"#).unwrap();
    let mut rewrites = 0usize;
    for page in pages.iter_mut() {
        let rewritten = re
            .replace_all(&page.content_md, |caps: &regex::Captures| {
                match map.get(&caps[1]) {
                    Some(cid) => {
                        rewrites += 1;
                        format!("{}/ipfs/{}", gateway, cid)
                    }
                    // Unknown media: keep the original reference verbatim.
                    None => caps[0].to_string(),
                }
            })
            .into_owned();
        if rewritten != page.content_md {
            page.content_md = rewritten;
        }
    }
    Ok(rewrites)
}
/// Picks the IPFS map file to use for `config`.
///
/// An explicitly configured `media.ipfs_map` is returned as-is. Otherwise
/// `ipfs-cache.json` inside the build input directory is returned, but only
/// when that file exists.
pub fn resolve_ipfs_map(config: &crate::config::SiteConfig) -> Option<std::path::PathBuf> {
    if let Some(explicit) = config.media.ipfs_map.clone() {
        return Some(explicit);
    }
    let fallback = config.build.input_dir.join("ipfs-cache.json");
    if fallback.exists() {
        Some(fallback)
    } else {
        None
    }
}
/// Applies IPFS rewrites using the map file resolved from `config`.
///
/// Returns the number of rewritten references together with the map path
/// that was used, or `(0, None)` when no map file is available.
pub fn apply_ipfs_rewrites_for_config(
    pages: &mut [ParsedPage],
    config: &crate::config::SiteConfig,
) -> anyhow::Result<(usize, Option<std::path::PathBuf>)> {
    match resolve_ipfs_map(config) {
        None => Ok((0, None)),
        Some(map_path) => {
            let rewritten = apply_ipfs_rewrites(pages, &map_path, &config.media.ipfs_gateway)?;
            Ok((rewritten, Some(map_path)))
        }
    }
}
pub fn synthesize_dir_indexes(pages: &mut Vec<ParsedPage>, subgraph_names: &[String]) {
let existing_ids: std::collections::HashSet<String> =
pages.iter().map(|p| p.id.clone()).collect();
let mut seen_dirs: std::collections::HashSet<String> = std::collections::HashSet::new();
for page in pages.iter() {
if let Some(ref ns) = page.namespace {
let mut accumulated = String::new();
for segment in ns.split('/').filter(|s| !s.is_empty()) {
if accumulated.is_empty() {
accumulated = segment.to_string();
} else {
accumulated = format!("{}/{}", accumulated, segment);
}
seen_dirs.insert(accumulated.clone());
}
}
}
for dir_name in &seen_dirs {
let dir_slug = slugify_page_name(dir_name);
if existing_ids.contains(&dir_slug) {
continue;
}
let short_name = dir_name.rsplit('/').next().unwrap_or(dir_name);
let owning_subgraph = subgraph_names
.iter()
.find(|sg| dir_name == *sg || dir_name.starts_with(&format!("{}/", sg)))
.cloned();
pages.push(ParsedPage {
id: dir_slug,
meta: PageMeta {
title: dir_name.clone(),
properties: std::collections::HashMap::new(),
tags: vec![],
public: Some(true),
aliases: vec![],
date: None,
icon: None,
menu_order: None,
stake: None,
},
kind: PageKind::Page,
source_path: std::path::PathBuf::new(),
namespace: dir_name.rsplitn(2, '/').nth(1).map(|s| s.to_string()),
subgraph: owning_subgraph,
content_md: format!("# {}\n", short_name),
outgoing_links: vec![],
});
}
}
/// Parses every discovered file into a [`ParsedPage`].
///
/// Pages are parsed first, then journals, then non-markdown files; the
/// result keeps that order. Parsing stops at the first error, which is
/// returned.
pub fn parse_all(discovered: &DiscoveredFiles) -> Result<Vec<ParsedPage>> {
    let markdown = discovered
        .pages
        .iter()
        .chain(discovered.journals.iter())
        .map(|file| parse_file(file));
    let other_files = discovered
        .files
        .iter()
        .map(|file| parse_non_md_file(file));
    // Collecting into `Result` short-circuits on the first failure.
    markdown.chain(other_files).collect()
}
pub fn parse_file(file: &DiscoveredFile) -> Result<ParsedPage> {
let content = std::fs::read_to_string(&file.path)?;
let (meta, content_after_props) = properties::extract_properties(&content, &file.name);
let normalized = if looks_like_outliner(&content_after_props) {
outliner::normalize(&content_after_props)
} else {
content_after_props
};
let normalized = admonitions::transform_admonitions(&normalized);
let outgoing_links = wikilinks::collect_wikilinks(&normalized);
let namespace = extract_namespace(&file.name);
let kind = match file.kind {
FileKind::Journal => PageKind::Journal,
_ => PageKind::Page,
};
let id = slugify_page_name(&file.name);
let is_readme = file.path.file_stem()
.map(|s| s.to_string_lossy().eq_ignore_ascii_case("readme"))
.unwrap_or(false);
let normalized = if file.subgraph.is_some() {
rewrite_relative_links(&normalized, &file.name, is_readme)
} else {
normalized
};
Ok(ParsedPage {
id,
meta,
kind,
source_path: file.path.clone(),
namespace,
subgraph: file.subgraph.clone(),
content_md: normalized,
outgoing_links,
})
}
/// Returns a backtick fence that can safely wrap `text`: at least three
/// backticks, and one more than the longest backtick run inside `text`.
fn backtick_fence_for(text: &str) -> String {
    let mut longest = 0usize;
    let mut current = 0usize;
    for byte in text.bytes() {
        if byte == b'`' {
            current += 1;
            longest = longest.max(current);
        } else {
            current = 0;
        }
    }
    "`".repeat(longest.max(2) + 1)
}

/// Builds a [`ParsedPage`] of kind [`PageKind::File`] for a non-markdown file.
///
/// Text files are embedded as a fenced code block, tagged with the language
/// derived from the extension. The fence is sized so that backtick runs in
/// the file (for example code fences inside doc comments) cannot end the
/// block early. Files that cannot be read as UTF-8 text get a short stub with
/// name, size and type instead.
///
/// Tags are the extension's tag (if any) and, for files outside a subgraph,
/// the top-level directory. Wikilinks are collected only for files outside a
/// subgraph.
fn parse_non_md_file(file: &DiscoveredFile) -> Result<ParsedPage> {
    let id = slugify_page_name(&file.name);
    let namespace = extract_namespace(&file.name);
    let ext = file
        .path
        .extension()
        .map(|e| e.to_string_lossy().to_lowercase())
        .unwrap_or_default();

    let mut tags = Vec::new();
    if let Some(lang_tag) = extension_to_tag(&ext) {
        tags.push(lang_tag.to_string());
    }
    if file.subgraph.is_none() {
        if let Some(ns) = &namespace {
            let top_dir = ns.split('/').next().unwrap_or(ns);
            if !top_dir.is_empty() && !tags.iter().any(|t| t.as_str() == top_dir) {
                tags.push(top_dir.to_string());
            }
        }
    }

    let (content_md, outgoing_links) = match std::fs::read_to_string(&file.path) {
        Ok(text) => {
            let lang = extension_to_lang(&ext);
            let outgoing_links = if file.subgraph.is_none() {
                wikilinks::collect_wikilinks(&text)
            } else {
                Vec::new()
            };
            let fence = backtick_fence_for(&text);
            let content_md = format!("{}{}\n{}\n{}", fence, lang, text, fence);
            (content_md, outgoing_links)
        }
        // Deliberate best effort: any read failure (typically non-UTF-8
        // content) is presented as a binary file rather than aborting.
        Err(_) => {
            let size = std::fs::metadata(&file.path)
                .map(|m| format_size(m.len()))
                .unwrap_or_else(|_| "unknown size".to_string());
            let content_md = format!(
                "Binary file: `{}`\n\nSize: {}\nType: {}",
                file.name,
                size,
                if ext.is_empty() {
                    "unknown"
                } else {
                    ext.as_str()
                }
            );
            (content_md, Vec::new())
        }
    };

    Ok(ParsedPage {
        id,
        meta: PageMeta {
            title: file.name.clone(),
            properties: HashMap::new(),
            tags,
            public: Some(true),
            aliases: Vec::new(),
            date: None,
            icon: None,
            menu_order: None,
            stake: None,
        },
        kind: PageKind::File,
        source_path: file.path.clone(),
        namespace,
        subgraph: file.subgraph.clone(),
        content_md,
        outgoing_links,
    })
}
/// Returns the namespace of a page name: everything before its last `/`.
///
/// Names without a `/` have no namespace and yield `None`.
fn extract_namespace(name: &str) -> Option<String> {
    name.rsplit_once('/')
        .map(|(namespace, _leaf)| namespace.to_owned())
}
/// Maps a lowercase file extension to the language label used on the
/// fenced code block; unknown extensions yield an empty label.
fn extension_to_lang(ext: &str) -> &'static str {
    // (extensions, language label) pairs; each extension appears once.
    const LANGUAGES: &[(&[&str], &str)] = &[
        (&["rs"], "rust"),
        (&["nu"], "nu"),
        (&["py"], "python"),
        (&["js"], "javascript"),
        (&["ts"], "typescript"),
        (&["jsx"], "jsx"),
        (&["tsx"], "tsx"),
        (&["css"], "css"),
        (&["html", "htm"], "html"),
        (&["json"], "json"),
        (&["toml"], "toml"),
        (&["yaml", "yml"], "yaml"),
        (&["sh", "bash", "zsh"], "bash"),
        (&["sql"], "sql"),
        (&["go"], "go"),
        (&["c", "h"], "c"),
        (&["cpp", "hpp", "cc"], "cpp"),
        (&["java"], "java"),
        (&["rb"], "ruby"),
        (&["lua"], "lua"),
        (&["zig"], "zig"),
        (&["nix"], "nix"),
        (&["md", "markdown"], "markdown"),
        (&["xml"], "xml"),
        (&["csv"], "csv"),
        (&["txt"], "text"),
        (&["edn"], "clojure"),
        (&["gitignore"], "gitignore"),
    ];

    LANGUAGES
        .iter()
        .find(|entry| entry.0.contains(&ext))
        .map_or("", |entry| entry.1)
}
/// Maps a lowercase file extension to the tag attached to file pages,
/// or `None` when the extension has no associated tag.
fn extension_to_tag(ext: &str) -> Option<&'static str> {
    // (extensions, tag) pairs; each extension appears once.
    const TAGS: &[(&[&str], &str)] = &[
        (&["rs"], "rust"),
        (&["nu"], "nushell"),
        (&["py"], "python"),
        (&["js"], "javascript"),
        (&["ts"], "typescript"),
        (&["jsx", "tsx"], "react"),
        (&["css"], "css"),
        (&["html", "htm"], "html"),
        (&["json"], "json"),
        (&["toml"], "toml"),
        (&["yaml", "yml"], "yaml"),
        (&["sh", "bash", "zsh"], "shell"),
        (&["sql"], "sql"),
        (&["go"], "go"),
        (&["c", "h"], "c"),
        (&["cpp", "hpp", "cc"], "cpp"),
        (&["java"], "java"),
        (&["rb"], "ruby"),
        (&["lua"], "lua"),
        (&["zig"], "zig"),
        (&["nix"], "nix"),
        (&["xml"], "xml"),
        (&["md", "markdown"], "markdown"),
        (&["zip", "tar", "gz", "bz2", "xz"], "archive"),
        (&["png", "jpg", "jpeg", "gif", "webp", "svg", "ico"], "image"),
        (&["mp4", "mov", "webm", "avi"], "video"),
        (&["mp3", "wav", "ogg", "flac"], "audio"),
        (&["pdf"], "pdf"),
        (&["woff", "woff2", "ttf", "otf"], "font"),
        (&["ipynb"], "jupyter"),
    ];

    TAGS.iter()
        .find(|entry| entry.0.contains(&ext))
        .map(|entry| entry.1)
}
/// Formats a byte count for display using 1024-based units.
///
/// Values below 1024 are printed as whole bytes (`"512 B"`); larger values
/// are printed with one decimal in `KB`, `MB` or `GB`.
fn format_size(bytes: u64) -> String {
    const KB: u64 = 1024;
    const MB: u64 = KB * 1024;
    const GB: u64 = MB * 1024;

    match bytes {
        b if b < KB => format!("{} B", b),
        b if b < MB => format!("{:.1} KB", b as f64 / KB as f64),
        b if b < GB => format!("{:.1} MB", b as f64 / MB as f64),
        b => format!("{:.1} GB", b as f64 / GB as f64),
    }
}
/// Heuristically decides whether `content` is written in outliner style.
///
/// Looks at the first 20 non-blank lines and returns `true` when more than
/// half of them start (after indentation) with `"- "`. Content without any
/// non-blank line is never considered an outline.
fn looks_like_outliner(content: &str) -> bool {
    let mut sampled = 0usize;
    let mut bullets = 0usize;
    for line in content.lines().filter(|l| !l.trim().is_empty()).take(20) {
        sampled += 1;
        if line.trim_start().starts_with("- ") {
            bullets += 1;
        }
    }
    if sampled == 0 {
        return false;
    }
    (bullets as f64 / sampled as f64) > 0.5
}
/// Returns `true` when `path` names a media/asset file, judged by the
/// extension of its last path segment (case-insensitive).
///
/// A last segment without a `.` has no extension and is never media, so a
/// page that happens to be called `pdf` or `zip` is not mistaken for a file.
fn is_media_extension(path: &str) -> bool {
    // Only the final segment can carry the file extension.
    let file_name = path.rsplit('/').next().unwrap_or(path);
    let Some((_, ext)) = file_name.rsplit_once('.') else {
        return false;
    };
    matches!(
        ext.to_lowercase().as_str(),
        "png" | "jpg" | "jpeg" | "gif" | "svg" | "webp" | "ico" | "bmp" | "avif"
            | "mp4" | "webm" | "ogg" | "mp3" | "wav" | "flac"
            | "pdf" | "zip" | "tar" | "gz" | "woff" | "woff2" | "ttf" | "eot"
    )
}
/// Resolves the relative `url` against the `/`-separated directory `base`.
///
/// `.` segments are dropped and every `..` segment removes the last
/// remaining segment, wherever it appears in `url` (`../x`, `a/../x`, or a
/// bare `..`). `..` segments that would climb above the root are ignored.
/// Empty segments are kept, so a trailing `/` on `url` survives.
fn resolve_relative_url(url: &str, base: &str) -> String {
    let mut segments: Vec<&str> = base.split('/').collect();
    for segment in url.split('/') {
        match segment {
            "." => {}
            ".." => {
                // Popping an empty stack is a no-op: never climb past the root.
                segments.pop();
            }
            other => segments.push(other),
        }
    }
    segments.join("/")
}
fn rewrite_relative_links(content: &str, page_name: &str, is_readme: bool) -> String {
use regex::Regex;
lazy_static::lazy_static! {
static ref MD_LINK: Regex = Regex::new(
r"(^|[^!])\[([^\]]*)\]\(([^)]+)\)"
).unwrap();
static ref MD_IMG: Regex = Regex::new(
r"!\[([^\]]*)\]\(([^)]+)\)"
).unwrap();
static ref HTML_ATTR: Regex = Regex::new(
r#"((?:src|href)\s*=\s*")([^"]+)(")"#
).unwrap();
}
let base = if is_readme {
page_name
} else if let Some(pos) = page_name.rfind('/') {
&page_name[..pos]
} else {
page_name
};
let subgraph_name = page_name.split('/').next().unwrap_or(page_name);
let strip_graph_dir = |resolved: &str| -> String {
for d in ["root", "graph", "pages"] {
let prefix = format!("{}/{}/", subgraph_name, d);
if let Some(rest) = resolved.strip_prefix(&prefix) {
return format!("{}/{}", subgraph_name, rest);
}
}
resolved.to_string()
};
let is_external = |url: &str| -> bool {
url.starts_with("http://")
|| url.starts_with("https://")
|| url.starts_with('#')
|| url.starts_with('/')
|| url.starts_with("data:")
|| url.starts_with("mailto:")
};
let content = MD_IMG.replace_all(&content, |caps: ®ex::Captures| {
let alt = &caps[1];
let url = &caps[2];
if is_external(url) {
return caps[0].to_string();
}
let resolved = resolve_relative_url(url, base);
let repo_relative = resolved
.strip_prefix(&format!("{}/", subgraph_name))
.unwrap_or(&resolved);
format!("", alt, subgraph_name, repo_relative)
});
let content = MD_LINK.replace_all(&content, |caps: ®ex::Captures| {
let prefix = &caps[1];
let text = &caps[2];
let raw_url = &caps[3];
if is_external(raw_url) {
return caps[0].to_string();
}
let (url, fragment): (&str, &str) = match raw_url.find('#') {
Some(pos) => (&raw_url[..pos], &raw_url[pos..]),
None => (raw_url, ""),
};
if url.is_empty() {
return caps[0].to_string();
}
let resolved = resolve_relative_url(url, base);
if is_media_extension(&resolved) {
let repo_relative = resolved
.strip_prefix(&format!("{}/", subgraph_name))
.unwrap_or(&resolved);
return format!("{}[{}](/media/{}/{}{})", prefix, text, subgraph_name, repo_relative, fragment);
}
let resolved = strip_graph_dir(&resolved);
let resolved = resolved
.strip_suffix(".md")
.or_else(|| resolved.strip_suffix(".markdown"))
.unwrap_or(&resolved)
.to_string();
let resolved = resolved
.strip_suffix("/index")
.unwrap_or(&resolved)
.trim_end_matches('/')
.to_string();
let slug = slugify_page_name(&resolved);
format!("{}[{}](/{slug}{})", prefix, text, fragment)
});
let content = HTML_ATTR.replace_all(&content, |caps: ®ex::Captures| {
let attr_prefix = &caps[1]; let raw_url = &caps[2];
let quote_end = &caps[3]; if is_external(raw_url) {
return caps[0].to_string();
}
let (url, fragment): (&str, &str) = match raw_url.find('#') {
Some(pos) => (&raw_url[..pos], &raw_url[pos..]),
None => (raw_url, ""),
};
if url.is_empty() {
return caps[0].to_string();
}
let resolved = resolve_relative_url(url, base);
let repo_relative = resolved
.strip_prefix(&format!("{}/", subgraph_name))
.unwrap_or(&resolved);
if is_media_extension(url) {
format!("{}/media/{}/{}{}{}", attr_prefix, subgraph_name, repo_relative, fragment, quote_end)
} else {
let resolved = strip_graph_dir(&resolved);
let resolved = resolved
.strip_suffix(".md")
.or_else(|| resolved.strip_suffix(".markdown"))
.unwrap_or(&resolved)
.to_string();
let slug = slugify_page_name(&resolved);
format!("{}/{}{}{}", attr_prefix, slug, fragment, quote_end)
}
});
content.to_string()
}
/// Demotes every ATX heading in `md` by one level (`#` becomes `##`, ...).
///
/// Headings of level 1 to 5 gain one `#`; level-6 headings are left as they
/// are. A heading is a line whose first non-whitespace characters are one to
/// five `#` followed by a space or tab. Lines inside fenced code blocks are
/// copied unchanged.
///
/// Fences are tracked as follows: a fence opens with three or more backticks
/// or tildes (a backtick fence may not have backticks in its info string, so
/// a line that starts with inline code is not a fence), and closes only on a
/// line made of the same marker, at least as long as the opening run, and
/// followed by nothing but whitespace. A shorter fence nested inside a
/// longer one therefore does not end the block.
pub fn demote_headings(md: &str) -> String {
    let mut out = String::with_capacity(md.len() + 32);
    // Marker byte and run length of the currently open fence, if any.
    let mut open_fence: Option<(u8, usize)> = None;

    for line in md.split_inclusive('\n') {
        let trimmed = line.trim_start();
        let bytes = trimmed.as_bytes();

        // A candidate fence: three or more identical ` or ~ at line start.
        let fence = match bytes.first() {
            Some(&marker) if marker == b'`' || marker == b'~' => {
                let run = bytes.iter().take_while(|&&b| b == marker).count();
                (run >= 3).then_some((marker, run))
            }
            _ => None,
        };

        if let Some((open_marker, open_len)) = open_fence {
            if let Some((marker, run)) = fence {
                // `run` counts ASCII bytes, so slicing at it is always valid.
                let closes = marker == open_marker
                    && run >= open_len
                    && trimmed[run..].trim().is_empty();
                if closes {
                    open_fence = None;
                }
            }
            out.push_str(line);
            continue;
        }

        if let Some((marker, run)) = fence {
            if marker == b'~' || !trimmed[run..].contains('`') {
                open_fence = Some((marker, run));
                out.push_str(line);
                continue;
            }
        }

        let hashes = bytes.iter().take_while(|&&b| b == b'#').count();
        let is_heading = (1..=5).contains(&hashes)
            && matches!(bytes.get(hashes), Some(&b' ') | Some(&b'\t'));
        if is_heading {
            // Keep the original indentation and add one `#` in front.
            let leading_ws = line.len() - trimmed.len();
            out.push_str(&line[..leading_ws]);
            out.push('#');
            out.push_str(trimmed);
        } else {
            out.push_str(line);
        }
    }
    out
}
/// Appends a subgraph README to a root page.
///
/// The result is `root_md`, a horizontal rule, a `# from subgraph <name>`
/// heading, and then `readme_md` with its headings demoted by one level.
pub fn merge_subgraph_content(root_md: &str, subgraph_name: &str, readme_md: &str) -> String {
    let separator = format!("\n\n---\n\n# from subgraph {}\n\n", subgraph_name);
    let demoted_readme = demote_headings(readme_md);
    [root_md, separator.as_str(), demoted_readme.as_str()].concat()
}
// Unit tests for the pure helpers of this module.
#[cfg(test)]
mod tests {
use super::*;
// Slugs are lowercase, use `-` between words, and keep `/`, `$` and `.`.
#[test]
fn test_slugify() {
assert_eq!(
slugify_page_name("Collective Focus Theorem"),
"collective-focus-theorem"
);
assert_eq!(
slugify_page_name("projects/Cyber Valley"),
"projects/cyber-valley"
);
assert_eq!(slugify_page_name("2025-02-08"), "2025-02-08");
assert_eq!(slugify_page_name("$BOOT"), "$boot");
assert_eq!(slugify_page_name("$PUSSY on $SOL"), "$pussy-on-$sol");
assert_eq!(slugify_page_name(".moon names"), ".moon-names");
// The same name in precomposed (U+00F6) and decomposed (o + U+0308) form
// must produce the same slug.
let nfc = "G\u{00F6}del prison"; let nfd = "Go\u{0308}del prison"; assert_eq!(
slugify_page_name(nfc),
slugify_page_name(nfd),
"NFC and NFD slugs must match"
);
}
// Known extensions map to a language label; unknown ones map to "".
#[test]
fn test_extension_to_lang() {
assert_eq!(extension_to_lang("rs"), "rust");
assert_eq!(extension_to_lang("nu"), "nu");
assert_eq!(extension_to_lang("py"), "python");
assert_eq!(extension_to_lang("unknown_ext"), "");
}
// Sizes use 1024-based units with one decimal above the byte range.
#[test]
fn test_format_size() {
assert_eq!(format_size(512), "512 B");
assert_eq!(format_size(1536), "1.5 KB");
assert_eq!(format_size(2 * 1024 * 1024), "2.0 MB");
}
// A single-level name: the namespace is the part before the `/`.
#[test]
fn test_extract_namespace_simple() {
assert_eq!(extract_namespace("cyber/core"), Some("cyber".to_string()));
}
// A nested name: the namespace is everything before the last `/`.
#[test]
fn test_extract_namespace_deep() {
assert_eq!(
extract_namespace("a/b/c/page"),
Some("a/b/c".to_string())
);
}
// Names without `/` have no namespace.
#[test]
fn test_extract_namespace_root() {
assert_eq!(extract_namespace("page"), None);
assert_eq!(extract_namespace("simple-page"), None);
}
}