storkit: merge 427_story_server_side_text_normalization_for_chat_message_line_breaks

2026-03-28 10:39:13 +00:00
parent 1ae2fa9b9b
commit 52513b55ff
4 changed files with 196 additions and 1 deletions
@@ -14,11 +14,12 @@ pub fn format_startup_announcement(bot_name: &str) -> String {
 /// tasklists) so that common Markdown constructs render correctly in Matrix
 /// clients such as Element.
 pub fn markdown_to_html(markdown: &str) -> String {
+    let normalized = crate::chat::util::normalize_line_breaks(markdown);
    let options = Options::ENABLE_TABLES
        | Options::ENABLE_FOOTNOTES
        | Options::ENABLE_STRIKETHROUGH
        | Options::ENABLE_TASKLISTS;
-    let parser = Parser::new_ext(markdown, options);
+    let parser = Parser::new_ext(&normalized, options);
    let mut html_output = String::new();
    html::push_html(&mut html_output, parser);
    html_output
@@ -80,6 +81,20 @@ mod tests {
        );
    }

+    /// Prose lines separated by a lone newline must each end up in their own
+    /// `<p>` element once rendered to HTML.
+    #[test]
+    fn markdown_to_html_single_newline_prose_becomes_paragraphs() {
+        let html = markdown_to_html("Sentence one.\nSentence two.");
+        let has_first = html.contains("<p>Sentence one.</p>");
+        let has_second = html.contains("<p>Sentence two.</p>");
+        assert!(has_first, "expected separate paragraph for first sentence: {html}");
+        assert!(has_second, "expected separate paragraph for second sentence: {html}");
+    }
+
    #[test]
    fn startup_announcement_uses_bot_name() {
        assert_eq!(format_startup_announcement("Timmy"), "Timmy is online.");
@@ -6,9 +6,13 @@
 /// This function converts common Markdown constructs so messages render
 /// nicely in Slack instead of showing raw Markdown syntax.
 pub fn markdown_to_slack(text: &str) -> String {
+    use crate::chat::util::normalize_line_breaks;
    use regex::Regex;
    use std::sync::LazyLock;

+    let normalized = normalize_line_breaks(text);
+    let text = normalized.as_str();
+
    // Regexes are compiled once and reused across calls.
    static RE_FENCED_BLOCK: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r"(?ms)^```.*?\n(.*?)^```").unwrap());
@@ -55,6 +55,9 @@ pub fn chunk_for_whatsapp(text: &str) -> Vec<String> {
 /// This function converts common Markdown constructs so messages render
 /// nicely in WhatsApp instead of showing raw Markdown syntax.
 pub fn markdown_to_whatsapp(text: &str) -> String {
+    let normalized = crate::chat::util::normalize_line_breaks(text);
+    let text = normalized.as_str();
+
    // Regexes are compiled once and reused across calls.
    static RE_FENCED_BLOCK: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r"(?ms)^```.*?\n(.*?)^```").unwrap());
@@ -97,6 +97,91 @@ pub fn drain_complete_paragraphs(buffer: &mut String) -> Vec<String> {
    paragraphs
 }

+/// Normalize single newlines between prose lines to double newlines.
+///
+/// LLMs sometimes output text with single newlines between sentences, e.g.:
+/// ```text
+/// Sentence one.
+/// Sentence two.
+/// ```
+///
+/// In Markdown a single newline is a *soft break* and may render as a space
+/// (or nothing), causing sentences to appear joined ("sentence one.Sentence
+/// two").  This function converts single newlines between non-blank prose
+/// lines into double newlines (paragraph breaks) so they render correctly.
+///
+/// Single newlines are **preserved** (not doubled) when either the preceding
+/// or following line is a structured Markdown element:
+/// - Bullet list items (`- `, `* `, `+ `)
+/// - Ordered list items (`1. `, `2. `, …)
+/// - ATX headings (`#`, `##`, …)
+/// - Table rows (`|`)
+/// - Code fence delimiters (`` ``` ``)
+/// - Horizontal rules (three or more `-`, `*` or `_`, optionally spaced)
+///
+/// Content inside fenced code blocks is also preserved verbatim.
+///
+/// A line that is empty *or contains only whitespace* already acts as a
+/// paragraph break, so no separator is inserted next to it.  This includes
+/// the lone `\r` that remains on blank lines when CRLF input is split on
+/// `\n`.  When a separator is inserted after a line ending in `\r`, the
+/// separator carries a `\r` as well so the text keeps a single line-ending
+/// style.
+pub fn normalize_line_breaks(text: &str) -> String {
+    /// `true` for lines that are empty or whitespace-only (e.g. a lone `\r`).
+    fn is_blank(line: &str) -> bool {
+        line.trim().is_empty()
+    }
+
+    /// `true` when `line` is a Markdown construct whose neighbouring single
+    /// newlines must be left alone.
+    fn is_structured_line(line: &str) -> bool {
+        let trimmed = line.trim_start();
+        // Ignore the carriage return of a CRLF line ending; other trailing
+        // whitespace is kept so that an empty list item such as "- " still
+        // matches its marker.
+        let trimmed = trimmed.strip_suffix('\r').unwrap_or(trimmed);
+        if trimmed.is_empty() {
+            return false;
+        }
+        if trimmed.starts_with('#')
+            || trimmed.starts_with("- ")
+            || trimmed.starts_with("* ")
+            || trimmed.starts_with("+ ")
+            || trimmed.starts_with('|')
+            || trimmed.starts_with("```")
+        {
+            return true;
+        }
+        // Ordered list: one or more digits followed by ". "
+        let after_digits = trimmed.trim_start_matches(|c: char| c.is_ascii_digit());
+        if after_digits.len() < trimmed.len() && after_digits.starts_with(". ") {
+            return true;
+        }
+        // Horizontal rules: lines made entirely of -, *, or _ (at least 3 chars),
+        // optionally separated by spaces.
+        let only_rule_chars = trimmed
+            .chars()
+            .all(|c| matches!(c, '-' | '*' | '_' | ' '));
+        let rule_char_count = trimmed.chars().filter(|c| !c.is_whitespace()).count();
+        only_rule_chars && rule_char_count >= 3
+    }
+
+    let mut result: Vec<&str> = Vec::new();
+    let mut in_code_fence = false;
+    let mut prev: Option<&str> = None;
+
+    for line in text.split('\n') {
+        // A fence delimiter flips the state *before* the line is classified, so
+        // the opening fence is already "inside" and the closing fence is not.
+        if line.trim_start().starts_with("```") {
+            in_code_fence = !in_code_fence;
+        }
+
+        if let Some(prev_line) = prev {
+            // Insert a blank separator only when both the current and previous
+            // lines are non-blank prose (not inside a code fence, not
+            // structured Markdown).
+            let should_double = !in_code_fence
+                && !is_blank(line)
+                && !is_blank(prev_line)
+                && !is_structured_line(line)
+                && !is_structured_line(prev_line);
+
+            if should_double {
+                // Mirror the previous line's ending so CRLF text stays CRLF.
+                result.push(if prev_line.ends_with('\r') { "\r" } else { "" });
+            }
+        }
+
+        result.push(line);
+        prev = Some(line);
+    }
+
+    result.join("\n")
+}
+
 // ---------------------------------------------------------------------------
 // Tests
 // ---------------------------------------------------------------------------
@@ -312,4 +397,92 @@ mod tests {
        assert_eq!(all_paragraphs, vec!["First para.", "Second para."]);
        assert_eq!(buf, "Third.");
    }
+
+    // -- normalize_line_breaks -----------------------------------------------
+
+    /// A lone newline between two prose sentences is widened to a blank line.
+    #[test]
+    fn normalize_prose_single_newline_becomes_double() {
+        assert_eq!(
+            normalize_line_breaks("Sentence one.\nSentence two."),
+            "Sentence one.\n\nSentence two."
+        );
+    }
+
+    #[test]
+    fn normalize_existing_double_newline_unchanged() {
+        // The text already contains a paragraph break, so it must round-trip.
+        let already_separated = "Paragraph one.\n\nParagraph two.";
+        assert_eq!(normalize_line_breaks(already_separated), already_separated);
+    }
+
+    #[test]
+    fn normalize_bullet_list_single_newlines_preserved() {
+        // Consecutive bullet items are structured lines; nothing is inserted.
+        let bullets = "- item one\n- item two\n- item three";
+        assert_eq!(normalize_line_breaks(bullets), bullets);
+    }
+
+    #[test]
+    fn normalize_heading_single_newline_preserved() {
+        // A heading directly followed by prose keeps its single newline.
+        let heading_then_text = "# My Heading\nSome text below.";
+        assert_eq!(normalize_line_breaks(heading_then_text), heading_then_text);
+    }
+
+    #[test]
+    fn normalize_table_rows_single_newlines_preserved() {
+        // Every row starts with `|`, so the table must come back unchanged.
+        let table = "| Col A | Col B |\n| --- | --- |\n| val1 | val2 |";
+        assert_eq!(normalize_line_breaks(table), table);
+    }
+
+    #[test]
+    fn normalize_code_block_content_preserved_verbatim() {
+        // Lines between the fence delimiters are never separated.
+        let fenced = "```rust\nlet x = 1;\nlet y = 2;\n```";
+        assert_eq!(normalize_line_breaks(fenced), fenced);
+    }
+
+    #[test]
+    fn normalize_code_block_with_blank_line_inside_preserved() {
+        // A blank line inside the fence must not change how the block is handled.
+        let fenced = "```\nfn foo() {\n    let x = 1;\n\n    let y = 2;\n}\n```";
+        assert_eq!(normalize_line_breaks(fenced), fenced);
+    }
+
+    #[test]
+    fn normalize_mixed_prose_and_code_block() {
+        let output = normalize_line_breaks(
+            "First sentence.\nSecond sentence.\n\n```rust\nlet x = 1;\nlet y = 2;\n```\n\nThird sentence.\nFourth sentence.",
+        );
+        // Prose on either side of the fence is split into paragraphs, while the
+        // fenced code in the middle stays exactly as written.
+        let prose_before = output.contains("First sentence.\n\nSecond sentence.");
+        let code_intact = output.contains("```rust\nlet x = 1;\nlet y = 2;\n```");
+        let prose_after = output.contains("Third sentence.\n\nFourth sentence.");
+        assert!(prose_before, "prose before code: {output}");
+        assert!(code_intact, "code block preserved: {output}");
+        assert!(prose_after, "prose after code: {output}");
+    }
+
+    #[test]
+    fn normalize_ordered_list_single_newlines_preserved() {
+        // Numbered items are structured lines; nothing is inserted between them.
+        let numbered = "1. First item\n2. Second item\n3. Third item";
+        assert_eq!(normalize_line_breaks(numbered), numbered);
+    }
+
+    /// Empty input yields empty output.
+    #[test]
+    fn normalize_empty_string_unchanged() {
+        let empty = "";
+        assert_eq!(normalize_line_breaks(empty), empty);
+    }
+
+    /// Input without any newline has nothing to normalize.
+    #[test]
+    fn normalize_single_line_unchanged() {
+        let only_line = "Hello.";
+        assert_eq!(normalize_line_breaks(only_line), only_line);
+    }
+
+    /// Prose followed by a bullet keeps its single newline: the bullet is a
+    /// structured line, so no separator is added in front of it.
+    #[test]
+    fn normalize_prose_then_bullet_no_extra_blank() {
+        let prose_then_bullet = "Some prose.\n- bullet item";
+        assert_eq!(normalize_line_breaks(prose_then_bullet), prose_then_bullet);
+    }
 }