diff --git a/server/src/chat/transport/matrix/bot/format.rs b/server/src/chat/transport/matrix/bot/format.rs index fd8f0b60..fa2c87e5 100644 --- a/server/src/chat/transport/matrix/bot/format.rs +++ b/server/src/chat/transport/matrix/bot/format.rs @@ -14,11 +14,12 @@ pub fn format_startup_announcement(bot_name: &str) -> String { /// tasklists) so that common Markdown constructs render correctly in Matrix /// clients such as Element. pub fn markdown_to_html(markdown: &str) -> String { + let normalized = crate::chat::util::normalize_line_breaks(markdown); let options = Options::ENABLE_TABLES | Options::ENABLE_FOOTNOTES | Options::ENABLE_STRIKETHROUGH | Options::ENABLE_TASKLISTS; - let parser = Parser::new_ext(markdown, options); + let parser = Parser::new_ext(&normalized, options); let mut html_output = String::new(); html::push_html(&mut html_output, parser); html_output @@ -80,6 +81,20 @@ mod tests { ); } + #[test] + fn markdown_to_html_single_newline_prose_becomes_paragraphs() { + // Single newlines between prose sentences should produce separate paragraphs. + let html = markdown_to_html("Sentence one.\nSentence two."); + assert!( + html.contains("

Sentence one.

"), + "expected separate paragraph for first sentence: {html}" + ); + assert!( + html.contains("

Sentence two.

"), + "expected separate paragraph for second sentence: {html}" + ); + } + #[test] fn startup_announcement_uses_bot_name() { assert_eq!(format_startup_announcement("Timmy"), "Timmy is online."); diff --git a/server/src/chat/transport/slack/format.rs b/server/src/chat/transport/slack/format.rs index 6153112d..3ef8ff84 100644 --- a/server/src/chat/transport/slack/format.rs +++ b/server/src/chat/transport/slack/format.rs @@ -6,9 +6,13 @@ /// This function converts common Markdown constructs so messages render /// nicely in Slack instead of showing raw Markdown syntax. pub fn markdown_to_slack(text: &str) -> String { + use crate::chat::util::normalize_line_breaks; use regex::Regex; use std::sync::LazyLock; + let normalized = normalize_line_breaks(text); + let text = normalized.as_str(); + // Regexes are compiled once and reused across calls. static RE_FENCED_BLOCK: LazyLock = LazyLock::new(|| Regex::new(r"(?ms)^```.*?\n(.*?)^```").unwrap()); diff --git a/server/src/chat/transport/whatsapp/format.rs b/server/src/chat/transport/whatsapp/format.rs index bf9fa9f4..5989ded4 100644 --- a/server/src/chat/transport/whatsapp/format.rs +++ b/server/src/chat/transport/whatsapp/format.rs @@ -55,6 +55,9 @@ pub fn chunk_for_whatsapp(text: &str) -> Vec { /// This function converts common Markdown constructs so messages render /// nicely in WhatsApp instead of showing raw Markdown syntax. pub fn markdown_to_whatsapp(text: &str) -> String { + let normalized = crate::chat::util::normalize_line_breaks(text); + let text = normalized.as_str(); + // Regexes are compiled once and reused across calls. 
/// Normalize single newlines between prose lines to double newlines.
///
/// LLMs sometimes output text with single newlines between sentences, e.g.:
/// ```text
/// Sentence one.
/// Sentence two.
/// ```
///
/// In Markdown a single newline is a *soft break* and may render as a space
/// (or nothing), causing sentences to appear joined. This function inserts a
/// blank line between two adjacent prose lines so that each one becomes its
/// own paragraph.
///
/// A blank line is inserted only when **both** the current and the previous
/// line are non-blank prose. Single newlines are preserved when either line is
/// a structured Markdown element:
/// - Bullet list items (`- `, `* `, `+ `)
/// - Ordered list items (`1. `, `2. `, …)
/// - ATX headings (`#`, `##`, …)
/// - Table rows (`|`)
/// - Block quotes (`>`)
/// - Horizontal rules (three or more of `-`, `*`, `_`)
/// - Code fence delimiters (three or more backticks or tildes)
///
/// Content inside fenced code blocks is copied verbatim. A fence is closed
/// only by a line that uses the same marker character, is at least as long as
/// the opening fence, and carries no trailing text, so a shorter fence nested
/// inside a longer one does not end the block early.
///
/// Lines that contain only whitespace (including the lone `\r` produced by
/// CRLF input) count as blank, so existing paragraph breaks are never widened.
/// When the previous line ends with `\r`, the inserted blank line is `\r\n`
/// to keep line endings consistent.
///
/// Returns a new `String`; the input is never modified.
pub fn normalize_line_breaks(text: &str) -> String {
    // A line is blank when it holds nothing but whitespace ("", "   ", "\r").
    fn is_blank(line: &str) -> bool {
        line.trim().is_empty()
    }

    // If the line starts (after indentation) with a run of at least three
    // backticks or tildes, return (marker, run length, text after the run).
    fn fence_run(line: &str) -> Option<(char, usize, &str)> {
        let trimmed = line.trim_start();
        let marker = trimmed.chars().next().filter(|c| matches!(c, '`' | '~'))?;
        let rest = trimmed.trim_start_matches(marker);
        // Both markers are single-byte ASCII, so byte length == char count.
        let run_len = trimmed.len() - rest.len();
        (run_len >= 3).then_some((marker, run_len, rest))
    }

    // An opening fence may carry an info string, but for backtick fences the
    // info string must not contain a backtick (that would be inline code).
    fn opens_fence(line: &str) -> Option<(char, usize)> {
        let (marker, run_len, info) = fence_run(line)?;
        (marker == '~' || !info.contains('`')).then_some((marker, run_len))
    }

    // A closing fence matches the opener's marker, is at least as long, and
    // has nothing but whitespace after it.
    fn closes_fence(line: &str, (open_marker, open_len): (char, usize)) -> bool {
        matches!(
            fence_run(line),
            Some((marker, run_len, rest))
                if marker == open_marker && run_len >= open_len && rest.trim().is_empty()
        )
    }

    fn is_structured_line(line: &str) -> bool {
        let trimmed = line.trim_start();
        if is_blank(trimmed) {
            return false;
        }
        let has_block_prefix = trimmed.starts_with('#')
            || trimmed.starts_with("- ")
            || trimmed.starts_with("* ")
            || trimmed.starts_with("+ ")
            || trimmed.starts_with('|')
            || trimmed.starts_with('>')
            || trimmed.starts_with("```")
            || trimmed.starts_with("~~~");
        if has_block_prefix {
            return true;
        }
        // Ordered list: one or more digits followed by ". ".
        let after_digits = trimmed.trim_start_matches(|c: char| c.is_ascii_digit());
        if after_digits.len() < trimmed.len() && after_digits.starts_with(". ") {
            return true;
        }
        // Horizontal rules: lines made entirely of -, *, or _ (at least 3),
        // optionally separated by spaces. Trailing whitespace (e.g. '\r') is
        // ignored so CRLF input is classified the same as LF input.
        let rule = trimmed.trim_end();
        rule.chars().all(|c| matches!(c, '-' | '*' | '_' | ' '))
            && rule.chars().filter(|c| !c.is_whitespace()).count() >= 3
    }

    let mut out = String::with_capacity(text.len());
    // Marker and length of the fence we are currently inside, if any.
    let mut open_fence: Option<(char, usize)> = None;
    let mut prev: Option<&str> = None;

    for line in text.split('\n') {
        // Fence delimiters and everything between them are copied verbatim.
        let verbatim = match open_fence {
            Some(fence) => {
                if closes_fence(line, fence) {
                    open_fence = None;
                }
                true
            }
            None => {
                open_fence = opens_fence(line);
                open_fence.is_some()
            }
        };

        if let Some(prev_line) = prev {
            out.push('\n');
            let needs_paragraph_break = !verbatim
                && !is_blank(line)
                && !is_blank(prev_line)
                && !is_structured_line(line)
                && !is_structured_line(prev_line);
            if needs_paragraph_break {
                // Keep the inserted blank line's ending consistent with CRLF input.
                if prev_line.ends_with('\r') {
                    out.push('\r');
                }
                out.push('\n');
            }
        }
        out.push_str(line);
        prev = Some(line);
    }

    out
}

// ---------------------------------------------------------------------------
// Tests for normalize_line_breaks
// ---------------------------------------------------------------------------

#[cfg(test)]
mod normalize_line_breaks_tests {
    use super::normalize_line_breaks;

    #[test]
    fn normalize_prose_single_newline_becomes_double() {
        let input = "Sentence one.\nSentence two.";
        let output = normalize_line_breaks(input);
        assert_eq!(output, "Sentence one.\n\nSentence two.");
    }

    #[test]
    fn normalize_existing_double_newline_unchanged() {
        let input = "Paragraph one.\n\nParagraph two.";
        let output = normalize_line_breaks(input);
        assert_eq!(output, "Paragraph one.\n\nParagraph two.");
    }

    #[test]
    fn normalize_bullet_list_single_newlines_preserved() {
        let input = "- item one\n- item two\n- item three";
        let output = normalize_line_breaks(input);
        assert_eq!(output, "- item one\n- item two\n- item three");
    }

    #[test]
    fn normalize_heading_single_newline_preserved() {
        let input = "# My Heading\nSome text below.";
        let output = normalize_line_breaks(input);
        assert_eq!(output, "# My Heading\nSome text below.");
    }

    #[test]
    fn normalize_table_rows_single_newlines_preserved() {
        let input = "| Col A | Col B |\n| --- | --- |\n| val1 | val2 |";
        let output = normalize_line_breaks(input);
        assert_eq!(output, "| Col A | Col B |\n| --- | --- |\n| val1 | val2 |");
    }

    #[test]
    fn normalize_code_block_content_preserved_verbatim() {
        let input = "```rust\nlet x = 1;\nlet y = 2;\n```";
        let output = normalize_line_breaks(input);
        assert_eq!(output, input);
    }

    #[test]
    fn normalize_code_block_with_blank_line_inside_preserved() {
        let input = "```\nfn foo() {\n    let x = 1;\n\n    let y = 2;\n}\n```";
        let output = normalize_line_breaks(input);
        assert_eq!(output, input);
    }

    #[test]
    fn normalize_mixed_prose_and_code_block() {
        let input = "First sentence.\nSecond sentence.\n\n```rust\nlet x = 1;\nlet y = 2;\n```\n\nThird sentence.\nFourth sentence.";
        let output = normalize_line_breaks(input);
        // Prose sentences before and after the code block get doubled.
        // The code block itself is preserved.
        assert!(
            output.contains("First sentence.\n\nSecond sentence."),
            "prose before code: {output}"
        );
        assert!(
            output.contains("```rust\nlet x = 1;\nlet y = 2;\n```"),
            "code block preserved: {output}"
        );
        assert!(
            output.contains("Third sentence.\n\nFourth sentence."),
            "prose after code: {output}"
        );
    }

    #[test]
    fn normalize_ordered_list_single_newlines_preserved() {
        let input = "1. First item\n2. Second item\n3. Third item";
        let output = normalize_line_breaks(input);
        assert_eq!(output, "1. First item\n2. Second item\n3. Third item");
    }

    #[test]
    fn normalize_empty_string_unchanged() {
        assert_eq!(normalize_line_breaks(""), "");
    }

    #[test]
    fn normalize_single_line_unchanged() {
        assert_eq!(normalize_line_breaks("Hello."), "Hello.");
    }

    #[test]
    fn normalize_prose_then_bullet_no_extra_blank() {
        // When prose is followed by a bullet item, no extra blank is inserted
        // because the bullet line is structured.
        let input = "Some prose.\n- bullet item";
        let output = normalize_line_breaks(input);
        assert_eq!(output, "Some prose.\n- bullet item");
    }

    #[test]
    fn normalize_crlf_paragraph_break_not_widened() {
        // The "\r" left behind by CRLF input is a blank line, not prose.
        let input = "Paragraph one.\r\n\r\nParagraph two.";
        assert_eq!(normalize_line_breaks(input), input);
    }

    #[test]
    fn normalize_crlf_prose_gets_crlf_blank_line() {
        let output = normalize_line_breaks("Sentence one.\r\nSentence two.");
        assert_eq!(output, "Sentence one.\r\n\r\nSentence two.");
    }

    #[test]
    fn normalize_whitespace_only_line_counts_as_blank() {
        let input = "Paragraph one.\n   \nParagraph two.";
        assert_eq!(normalize_line_breaks(input), input);
    }

    #[test]
    fn normalize_tilde_fence_content_preserved() {
        let input = "~~~\nlet x = 1;\nlet y = 2;\n~~~";
        assert_eq!(normalize_line_breaks(input), input);
    }

    #[test]
    fn normalize_nested_shorter_fence_does_not_close_block() {
        let input = "````\n```\ninner a\ninner b\n```\n````";
        assert_eq!(normalize_line_breaks(input), input);
    }

    #[test]
    fn normalize_block_quote_lines_preserved() {
        let input = "> quoted one\n> quoted two";
        assert_eq!(normalize_line_breaks(input), input);
    }
}