2026-04-12 13:11:23 +00:00
|
|
|
//! WhatsApp message formatting — Markdown-to-WhatsApp conversion and message chunking.
|
2026-03-27 10:49:39 +00:00
|
|
|
use regex::Regex;
|
|
|
|
|
use std::sync::LazyLock;
|
|
|
|
|
|
|
|
|
|
/// WhatsApp Business API maximum message body size in characters.
|
|
|
|
|
///
/// Note: `chunk_for_whatsapp` compares this limit against `str::len()`, which
/// is a UTF-8 byte count. A byte count is never smaller than the character
/// count, so the check is conservative for non-ASCII text.
pub(super) const WHATSAPP_MAX_MESSAGE_LEN: usize = 4096;
|
|
|
|
|
|
|
|
|
|
/// Split a text into chunks that fit within WhatsApp's message size limit.
|
|
|
|
|
///
|
|
|
|
|
/// Tries to split on paragraph boundaries (`\n\n`), falling back to line
|
|
|
|
|
/// boundaries (`\n`), and finally hard-splitting at the character limit.
|
|
|
|
|
pub fn chunk_for_whatsapp(text: &str) -> Vec<String> {
|
|
|
|
|
if text.len() <= WHATSAPP_MAX_MESSAGE_LEN {
|
|
|
|
|
return vec![text.to_string()];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let mut chunks = Vec::new();
|
|
|
|
|
let mut remaining = text;
|
|
|
|
|
|
|
|
|
|
while !remaining.is_empty() {
|
|
|
|
|
if remaining.len() <= WHATSAPP_MAX_MESSAGE_LEN {
|
|
|
|
|
chunks.push(remaining.to_string());
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Find the best split point within the limit.
|
|
|
|
|
let window = &remaining[..WHATSAPP_MAX_MESSAGE_LEN];
|
|
|
|
|
|
|
|
|
|
// Prefer paragraph boundary.
|
|
|
|
|
let split_pos = window
|
|
|
|
|
.rfind("\n\n")
|
|
|
|
|
.or_else(|| window.rfind('\n'))
|
|
|
|
|
.unwrap_or(WHATSAPP_MAX_MESSAGE_LEN);
|
|
|
|
|
|
|
|
|
|
let (chunk, rest) = remaining.split_at(split_pos);
|
|
|
|
|
let chunk = chunk.trim();
|
|
|
|
|
if !chunk.is_empty() {
|
|
|
|
|
chunks.push(chunk.to_string());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Skip the delimiter.
|
|
|
|
|
remaining = rest.trim_start_matches('\n');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
chunks
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Convert standard Markdown formatting to WhatsApp-native formatting.
|
|
|
|
|
///
|
|
|
|
|
/// WhatsApp supports a limited subset of formatting:
|
|
|
|
|
/// - Bold: `*text*`
|
|
|
|
|
/// - Italic: `_text_`
|
|
|
|
|
/// - Strikethrough: `~text~`
|
|
|
|
|
/// - Monospace / code: backtick-delimited (same as Markdown)
|
|
|
|
|
///
|
|
|
|
|
/// This function converts common Markdown constructs so messages render
|
|
|
|
|
/// nicely in WhatsApp instead of showing raw Markdown syntax.
|
|
|
|
|
pub fn markdown_to_whatsapp(text: &str) -> String {
|
2026-03-28 10:39:13 +00:00
|
|
|
let normalized = crate::chat::util::normalize_line_breaks(text);
|
|
|
|
|
let text = normalized.as_str();
|
|
|
|
|
|
2026-03-27 10:49:39 +00:00
|
|
|
// Regexes are compiled once and reused across calls.
|
|
|
|
|
static RE_FENCED_BLOCK: LazyLock<Regex> =
|
|
|
|
|
LazyLock::new(|| Regex::new(r"(?ms)^```.*?\n(.*?)^```").unwrap());
|
|
|
|
|
static RE_HEADER: LazyLock<Regex> =
|
|
|
|
|
LazyLock::new(|| Regex::new(r"(?m)^#{1,6}\s+(.+)$").unwrap());
|
|
|
|
|
static RE_BOLD_ITALIC: LazyLock<Regex> =
|
|
|
|
|
LazyLock::new(|| Regex::new(r"\*\*\*(.+?)\*\*\*").unwrap());
|
2026-04-13 14:07:08 +00:00
|
|
|
static RE_BOLD: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\*\*(.+?)\*\*").unwrap());
|
|
|
|
|
static RE_STRIKETHROUGH: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"~~(.+?)~~").unwrap());
|
2026-03-27 10:49:39 +00:00
|
|
|
static RE_LINK: LazyLock<Regex> =
|
|
|
|
|
LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").unwrap());
|
2026-04-13 14:07:08 +00:00
|
|
|
static RE_HR: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?m)^---+$").unwrap());
|
2026-03-27 10:49:39 +00:00
|
|
|
|
|
|
|
|
// 1. Protect fenced code blocks by replacing them with placeholders.
|
|
|
|
|
let mut code_blocks: Vec<String> = Vec::new();
|
|
|
|
|
let protected = RE_FENCED_BLOCK.replace_all(text, |caps: ®ex::Captures| {
|
|
|
|
|
let idx = code_blocks.len();
|
|
|
|
|
code_blocks.push(caps[0].to_string());
|
|
|
|
|
format!("\x00CODEBLOCK{idx}\x00")
|
|
|
|
|
});
|
|
|
|
|
let mut out = protected.into_owned();
|
|
|
|
|
|
|
|
|
|
// 2. Headers → bold text.
|
|
|
|
|
out = RE_HEADER.replace_all(&out, "*$1*").into_owned();
|
|
|
|
|
|
|
|
|
|
// 3. Bold+italic (***text***) → bold italic (*_text_*).
|
|
|
|
|
out = RE_BOLD_ITALIC.replace_all(&out, "*_${1}_*").into_owned();
|
|
|
|
|
|
|
|
|
|
// 4. Bold (**text**) → WhatsApp bold (*text*).
|
|
|
|
|
out = RE_BOLD.replace_all(&out, "*$1*").into_owned();
|
|
|
|
|
|
|
|
|
|
// 5. Strikethrough (~~text~~) → WhatsApp strikethrough (~text~).
|
|
|
|
|
out = RE_STRIKETHROUGH.replace_all(&out, "~$1~").into_owned();
|
|
|
|
|
|
|
|
|
|
// 6. Links [text](url) → text (url).
|
|
|
|
|
out = RE_LINK.replace_all(&out, "$1 ($2)").into_owned();
|
|
|
|
|
|
|
|
|
|
// 7. Horizontal rules → empty line (just remove them).
|
|
|
|
|
out = RE_HR.replace_all(&out, "").into_owned();
|
|
|
|
|
|
|
|
|
|
// 8. Restore code blocks.
|
|
|
|
|
for (idx, block) in code_blocks.iter().enumerate() {
|
|
|
|
|
out = out.replace(&format!("\x00CODEBLOCK{idx}\x00"), block);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
out
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ── Tests ───────────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    // ── chunk_for_whatsapp tests ────────────────────────────────────────

    /// Text under the limit comes back untouched as one chunk.
    #[test]
    fn chunk_short_message_returns_single_chunk() {
        assert_eq!(chunk_for_whatsapp("Hello world"), ["Hello world"]);
    }

    /// Text whose length equals the limit is not split.
    #[test]
    fn chunk_exactly_at_limit_returns_single_chunk() {
        let input = "a".repeat(WHATSAPP_MAX_MESSAGE_LEN);
        let result = chunk_for_whatsapp(&input);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].len(), WHATSAPP_MAX_MESSAGE_LEN);
    }

    /// A blank line near the limit is the preferred split point.
    #[test]
    fn chunk_splits_on_paragraph_boundary() {
        let head = "a".repeat(4000);
        let tail = "b".repeat(200);
        let result = chunk_for_whatsapp(&format!("{head}\n\n{tail}"));
        assert_eq!(result.len(), 2);
        assert_eq!(result[0], head);
        assert_eq!(result[1], tail);
    }

    /// Without a blank line, a single newline is used as the split point.
    #[test]
    fn chunk_splits_on_line_boundary_when_no_paragraph_break() {
        let head = "a".repeat(4000);
        let tail = "b".repeat(200);
        let result = chunk_for_whatsapp(&format!("{head}\n{tail}"));
        assert_eq!(result.len(), 2);
        assert_eq!(result[0], head);
        assert_eq!(result[1], tail);
    }

    /// Text without any newline is cut at the size limit without losing data.
    #[test]
    fn chunk_hard_splits_continuous_text() {
        let input = "x".repeat(WHATSAPP_MAX_MESSAGE_LEN * 2 + 100);
        let result = chunk_for_whatsapp(&input);
        assert!(result.len() >= 2);
        assert!(result
            .iter()
            .all(|piece| piece.len() <= WHATSAPP_MAX_MESSAGE_LEN));
        // Verify all content is preserved.
        let total: usize = result.iter().map(String::len).sum();
        assert_eq!(total, input.len());
    }

    /// The empty string yields a single empty chunk.
    #[test]
    fn chunk_empty_string_returns_single_empty() {
        assert_eq!(chunk_for_whatsapp(""), [""]);
    }

    // ── markdown_to_whatsapp tests ────────────────────────────────────────

    /// Every header level is rendered as a bold line.
    #[test]
    fn md_to_wa_converts_headers_to_bold() {
        let cases = [
            ("# Title", "*Title*"),
            ("## Subtitle", "*Subtitle*"),
            ("### Section", "*Section*"),
            ("###### Deep", "*Deep*"),
        ];
        for (input, expected) in cases {
            assert_eq!(markdown_to_whatsapp(input), expected);
        }
    }

    #[test]
    fn md_to_wa_converts_bold() {
        let rendered = markdown_to_whatsapp("**bold text**");
        assert_eq!(rendered, "*bold text*");
    }

    #[test]
    fn md_to_wa_converts_bold_italic() {
        let rendered = markdown_to_whatsapp("***emphasis***");
        assert_eq!(rendered, "*_emphasis_*");
    }

    #[test]
    fn md_to_wa_converts_strikethrough() {
        let rendered = markdown_to_whatsapp("~~removed~~");
        assert_eq!(rendered, "~removed~");
    }

    #[test]
    fn md_to_wa_converts_links() {
        let rendered = markdown_to_whatsapp("[click here](https://example.com)");
        assert_eq!(rendered, "click here (https://example.com)");
    }

    #[test]
    fn md_to_wa_removes_horizontal_rules() {
        let rendered = markdown_to_whatsapp("above\n---\nbelow");
        assert_eq!(rendered, "above\n\nbelow");
    }

    #[test]
    fn md_to_wa_preserves_inline_code() {
        let source = "use `foo()` here";
        assert_eq!(markdown_to_whatsapp(source), "use `foo()` here");
    }

    #[test]
    fn md_to_wa_preserves_code_blocks() {
        let rendered = markdown_to_whatsapp(
            "before\n```rust\nfn main() {\n println!(\"**not bold**\");\n}\n```\nafter",
        );
        // Code block content must NOT be converted.
        assert!(rendered.contains("\"**not bold**\""));
        // The text around the block must still be present.
        for fragment in ["before", "after"] {
            assert!(rendered.contains(fragment));
        }
    }

    #[test]
    fn md_to_wa_mixed_message() {
        let rendered = markdown_to_whatsapp(
            "### Philosophy\n- **Stories** define the change\n- ~~old~~ is gone\n- See [docs](https://example.com)",
        );
        assert!(rendered.starts_with("*Philosophy*"));
        for fragment in ["*Stories*", "~old~", "docs (https://example.com)"] {
            assert!(rendered.contains(fragment));
        }
    }

    #[test]
    fn md_to_wa_passthrough_plain_text() {
        let source = "Hello, how are you?";
        assert_eq!(markdown_to_whatsapp(source), source);
    }

    #[test]
    fn md_to_wa_empty_string() {
        assert_eq!(markdown_to_whatsapp(""), "");
    }
}
|