Files
huskies/server/src/chat/transport/whatsapp/format.rs
T

252 lines
8.8 KiB
Rust
Raw Normal View History

//! WhatsApp message formatting — Markdown-to-WhatsApp conversion and message chunking.
use regex::Regex;
use std::sync::LazyLock;
/// WhatsApp Business API maximum message body size in characters.
///
/// Note: the chunking code in this module compares this value against
/// `str::len()`, i.e. the UTF-8 *byte* length. A string's byte length is
/// never smaller than its character count, so the check is conservative:
/// chunks may be split earlier than strictly necessary for non-ASCII text.
pub(super) const WHATSAPP_MAX_MESSAGE_LEN: usize = 4096;
/// Split a text into chunks that fit within WhatsApp's message size limit.
///
/// Tries to split on paragraph boundaries (`\n\n`), falling back to line
/// boundaries (`\n`), and finally hard-splitting at the size limit.
///
/// Sizes are measured in UTF-8 bytes (`str::len()`). A hard split is moved
/// back to the nearest character boundary, so a multi-byte character is never
/// cut in half (slicing at a non-boundary byte offset would panic).
///
/// Text that already fits is returned unchanged as a single chunk, including
/// the empty string. When splitting, every chunk except the last is trimmed of
/// surrounding whitespace, and chunks that end up empty are dropped.
pub fn chunk_for_whatsapp(text: &str) -> Vec<String> {
    if text.len() <= WHATSAPP_MAX_MESSAGE_LEN {
        return vec![text.to_string()];
    }

    let mut chunks = Vec::new();
    let mut remaining = text;

    while !remaining.is_empty() {
        if remaining.len() <= WHATSAPP_MAX_MESSAGE_LEN {
            chunks.push(remaining.to_string());
            break;
        }

        // Largest byte offset <= the limit that lies on a char boundary.
        // A UTF-8 character is at most 4 bytes, so this backs off by at most
        // 3 bytes and can never reach 0 with a 4096-byte limit.
        let mut limit = WHATSAPP_MAX_MESSAGE_LEN;
        while !remaining.is_char_boundary(limit) {
            limit -= 1;
        }

        // Find the best split point within the limit: prefer a paragraph
        // boundary, then a line boundary, then a hard split at `limit`.
        let window = &remaining[..limit];
        let split_pos = window
            .rfind("\n\n")
            .or_else(|| window.rfind('\n'))
            .unwrap_or(limit);

        let (chunk, rest) = remaining.split_at(split_pos);
        let chunk = chunk.trim();
        if !chunk.is_empty() {
            chunks.push(chunk.to_string());
        }

        // Skip the delimiter. This also guarantees progress when the split
        // position is 0 (window starting with a newline).
        remaining = rest.trim_start_matches('\n');
    }

    chunks
}
/// Convert standard Markdown formatting to WhatsApp-native formatting.
///
/// WhatsApp supports a limited subset of formatting:
/// - Bold: `*text*`
/// - Italic: `_text_`
/// - Strikethrough: `~text~`
/// - Monospace / code: backtick-delimited (same as Markdown)
///
/// This function converts common Markdown constructs so messages render
/// nicely in WhatsApp instead of showing raw Markdown syntax:
/// headers and `**bold**` become `*bold*`, `***x***` becomes `*_x_*`,
/// `~~x~~` becomes `~x~`, `[text](url)` becomes `text (url)`, and
/// horizontal rules (`---`) are blanked out.
///
/// Code is never rewritten: both fenced code blocks and single-backtick
/// inline code spans are swapped for placeholders before the conversions run
/// and restored verbatim afterwards, so e.g. `` `**kwargs` `` stays intact.
pub fn markdown_to_whatsapp(text: &str) -> String {
    let normalized = crate::chat::util::normalize_line_breaks(text);
    let text = normalized.as_str();

    // Regexes are compiled once and reused across calls.
    static RE_FENCED_BLOCK: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r"(?ms)^```.*?\n(.*?)^```").unwrap());
    // Single-backtick span on one line, e.g. `foo()`.
    static RE_INLINE_CODE: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r"`[^`\n]+`").unwrap());
    static RE_HEADER: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r"(?m)^#{1,6}\s+(.+)$").unwrap());
    static RE_BOLD_ITALIC: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r"\*\*\*(.+?)\*\*\*").unwrap());
    static RE_BOLD: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r"\*\*(.+?)\*\*").unwrap());
    static RE_STRIKETHROUGH: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r"~~(.+?)~~").unwrap());
    static RE_LINK: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").unwrap());
    static RE_HR: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r"(?m)^---+$").unwrap());

    // 1. Protect fenced code blocks by replacing them with placeholders.
    let mut code_blocks: Vec<String> = Vec::new();
    let without_blocks = RE_FENCED_BLOCK.replace_all(text, |caps: &regex::Captures| {
        let idx = code_blocks.len();
        code_blocks.push(caps[0].to_string());
        format!("\x00CODEBLOCK{idx}\x00")
    });

    // 2. Protect inline code spans the same way. Fenced blocks are already
    //    placeholders at this point, so their backticks are not seen here.
    let mut inline_spans: Vec<String> = Vec::new();
    let without_code = RE_INLINE_CODE.replace_all(&without_blocks, |caps: &regex::Captures| {
        let idx = inline_spans.len();
        inline_spans.push(caps[0].to_string());
        format!("\x00INLINECODE{idx}\x00")
    });
    let mut out = without_code.into_owned();

    // 3. Apply the Markdown → WhatsApp rewrites. Order matters: headers
    //    first, and `***` must be handled before `**`.
    let rules: [(&LazyLock<Regex>, &str); 6] = [
        // Headers → bold text.
        (&RE_HEADER, "*$1*"),
        // Bold+italic (***text***) → bold italic (*_text_*).
        (&RE_BOLD_ITALIC, "*_${1}_*"),
        // Bold (**text**) → WhatsApp bold (*text*).
        (&RE_BOLD, "*$1*"),
        // Strikethrough (~~text~~) → WhatsApp strikethrough (~text~).
        (&RE_STRIKETHROUGH, "~$1~"),
        // Links [text](url) → text (url).
        (&RE_LINK, "$1 ($2)"),
        // Horizontal rules → empty line (just remove them).
        (&RE_HR, ""),
    ];
    for (re, replacement) in rules {
        out = re.replace_all(&out, replacement).into_owned();
    }

    // 4. Restore inline code spans, then fenced code blocks, verbatim.
    for (idx, span) in inline_spans.iter().enumerate() {
        out = out.replace(&format!("\x00INLINECODE{idx}\x00"), span);
    }
    for (idx, block) in code_blocks.iter().enumerate() {
        out = out.replace(&format!("\x00CODEBLOCK{idx}\x00"), block);
    }
    out
}
// ── Tests ───────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
    use super::*;

    // ── chunk_for_whatsapp tests ────────────────────────────────────────

    /// Text well below the limit comes back as exactly one chunk.
    #[test]
    fn chunk_short_message_returns_single_chunk() {
        let message = "Hello world";
        assert_eq!(chunk_for_whatsapp(message), vec![message]);
    }

    /// A message of exactly the maximum size is not split.
    #[test]
    fn chunk_exactly_at_limit_returns_single_chunk() {
        let at_limit = "a".repeat(WHATSAPP_MAX_MESSAGE_LEN);
        let chunks = chunk_for_whatsapp(&at_limit);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].len(), WHATSAPP_MAX_MESSAGE_LEN);
    }

    /// A blank line near the limit is chosen as the split point.
    #[test]
    fn chunk_splits_on_paragraph_boundary() {
        let head = "a".repeat(4000);
        let tail = "b".repeat(200);
        let joined = [head.as_str(), tail.as_str()].join("\n\n");
        let chunks = chunk_for_whatsapp(&joined);
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0], head);
        assert_eq!(chunks[1], tail);
    }

    /// Without a blank line, a single newline is used as the split point.
    #[test]
    fn chunk_splits_on_line_boundary_when_no_paragraph_break() {
        let head = "a".repeat(4000);
        let tail = "b".repeat(200);
        let joined = [head.as_str(), tail.as_str()].join("\n");
        let chunks = chunk_for_whatsapp(&joined);
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0], head);
        assert_eq!(chunks[1], tail);
    }

    /// Text without any newline is cut at the size limit.
    #[test]
    fn chunk_hard_splits_continuous_text() {
        let text = "x".repeat(WHATSAPP_MAX_MESSAGE_LEN * 2 + 100);
        let chunks = chunk_for_whatsapp(&text);
        assert!(chunks.len() >= 2);
        assert!(chunks.iter().all(|c| c.len() <= WHATSAPP_MAX_MESSAGE_LEN));
        // Verify all content is preserved.
        let total_len: usize = chunks.iter().map(String::len).sum();
        assert_eq!(total_len, text.len());
    }

    /// Empty input yields one empty chunk rather than no chunks.
    #[test]
    fn chunk_empty_string_returns_single_empty() {
        assert_eq!(chunk_for_whatsapp(""), vec![""]);
    }

    // ── markdown_to_whatsapp tests ────────────────────────────────────────

    /// Every header level collapses to WhatsApp bold.
    #[test]
    fn md_to_wa_converts_headers_to_bold() {
        let cases = [
            ("# Title", "*Title*"),
            ("## Subtitle", "*Subtitle*"),
            ("### Section", "*Section*"),
            ("###### Deep", "*Deep*"),
        ];
        for (input, expected) in cases {
            assert_eq!(markdown_to_whatsapp(input), expected);
        }
    }

    /// `**x**` becomes `*x*`.
    #[test]
    fn md_to_wa_converts_bold() {
        let converted = markdown_to_whatsapp("**bold text**");
        assert_eq!(converted, "*bold text*");
    }

    /// `***x***` becomes `*_x_*`.
    #[test]
    fn md_to_wa_converts_bold_italic() {
        let converted = markdown_to_whatsapp("***emphasis***");
        assert_eq!(converted, "*_emphasis_*");
    }

    /// `~~x~~` becomes `~x~`.
    #[test]
    fn md_to_wa_converts_strikethrough() {
        let converted = markdown_to_whatsapp("~~removed~~");
        assert_eq!(converted, "~removed~");
    }

    /// `[text](url)` becomes `text (url)`.
    #[test]
    fn md_to_wa_converts_links() {
        let converted = markdown_to_whatsapp("[click here](https://example.com)");
        assert_eq!(converted, "click here (https://example.com)");
    }

    /// A `---` line is blanked, leaving an empty line behind.
    #[test]
    fn md_to_wa_removes_horizontal_rules() {
        let converted = markdown_to_whatsapp("above\n---\nbelow");
        assert_eq!(converted, "above\n\nbelow");
    }

    /// Inline code without Markdown markers passes through untouched.
    #[test]
    fn md_to_wa_preserves_inline_code() {
        let input = "use `foo()` here";
        assert_eq!(markdown_to_whatsapp(input), input);
    }

    /// Markdown markers inside a fenced block are left alone.
    #[test]
    fn md_to_wa_preserves_code_blocks() {
        let input = "before\n```rust\nfn main() {\n println!(\"**not bold**\");\n}\n```\nafter";
        let output = markdown_to_whatsapp(input);
        // Code block content must NOT be converted.
        assert!(output.contains("\"**not bold**\""));
        // But surrounding text is still converted.
        for surrounding in ["before", "after"] {
            assert!(output.contains(surrounding));
        }
    }

    /// Several constructs in one message are all converted.
    #[test]
    fn md_to_wa_mixed_message() {
        let input = "### Philosophy\n- **Stories** define the change\n- ~~old~~ is gone\n- See [docs](https://example.com)";
        let output = markdown_to_whatsapp(input);
        assert!(output.starts_with("*Philosophy*"));
        for fragment in ["*Stories*", "~old~", "docs (https://example.com)"] {
            assert!(output.contains(fragment));
        }
    }

    /// Text with no Markdown syntax is returned as-is.
    #[test]
    fn md_to_wa_passthrough_plain_text() {
        let plain = "Hello, how are you?";
        let converted = markdown_to_whatsapp(plain);
        assert_eq!(converted, plain);
    }

    /// Empty input produces empty output.
    #[test]
    fn md_to_wa_empty_string() {
        assert_eq!(markdown_to_whatsapp(""), "");
    }
}