storkit: merge 427_story_server_side_text_normalization_for_chat_message_line_breaks
This commit is contained in:
@@ -14,11 +14,12 @@ pub fn format_startup_announcement(bot_name: &str) -> String {
|
||||
/// tasklists) so that common Markdown constructs render correctly in Matrix
|
||||
/// clients such as Element.
|
||||
pub fn markdown_to_html(markdown: &str) -> String {
|
||||
let normalized = crate::chat::util::normalize_line_breaks(markdown);
|
||||
let options = Options::ENABLE_TABLES
|
||||
| Options::ENABLE_FOOTNOTES
|
||||
| Options::ENABLE_STRIKETHROUGH
|
||||
| Options::ENABLE_TASKLISTS;
|
||||
let parser = Parser::new_ext(markdown, options);
|
||||
let parser = Parser::new_ext(&normalized, options);
|
||||
let mut html_output = String::new();
|
||||
html::push_html(&mut html_output, parser);
|
||||
html_output
|
||||
@@ -80,6 +81,20 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
/// Two prose sentences separated by a single newline must each be rendered
/// inside their own `<p>` element.
#[test]
fn markdown_to_html_single_newline_prose_becomes_paragraphs() {
    let html = markdown_to_html("Sentence one.\nSentence two.");

    let first_is_paragraph = html.contains("<p>Sentence one.</p>");
    assert!(
        first_is_paragraph,
        "expected separate paragraph for first sentence: {html}"
    );

    let second_is_paragraph = html.contains("<p>Sentence two.</p>");
    assert!(
        second_is_paragraph,
        "expected separate paragraph for second sentence: {html}"
    );
}
|
||||
|
||||
#[test]
|
||||
fn startup_announcement_uses_bot_name() {
|
||||
assert_eq!(format_startup_announcement("Timmy"), "Timmy is online.");
|
||||
|
||||
@@ -6,9 +6,13 @@
|
||||
/// This function converts common Markdown constructs so messages render
|
||||
/// nicely in Slack instead of showing raw Markdown syntax.
|
||||
pub fn markdown_to_slack(text: &str) -> String {
|
||||
use crate::chat::util::normalize_line_breaks;
|
||||
use regex::Regex;
|
||||
use std::sync::LazyLock;
|
||||
|
||||
let normalized = normalize_line_breaks(text);
|
||||
let text = normalized.as_str();
|
||||
|
||||
// Regexes are compiled once and reused across calls.
|
||||
static RE_FENCED_BLOCK: LazyLock<Regex> =
|
||||
LazyLock::new(|| Regex::new(r"(?ms)^```.*?\n(.*?)^```").unwrap());
|
||||
|
||||
@@ -55,6 +55,9 @@ pub fn chunk_for_whatsapp(text: &str) -> Vec<String> {
|
||||
/// This function converts common Markdown constructs so messages render
|
||||
/// nicely in WhatsApp instead of showing raw Markdown syntax.
|
||||
pub fn markdown_to_whatsapp(text: &str) -> String {
|
||||
let normalized = crate::chat::util::normalize_line_breaks(text);
|
||||
let text = normalized.as_str();
|
||||
|
||||
// Regexes are compiled once and reused across calls.
|
||||
static RE_FENCED_BLOCK: LazyLock<Regex> =
|
||||
LazyLock::new(|| Regex::new(r"(?ms)^```.*?\n(.*?)^```").unwrap());
|
||||
|
||||
@@ -97,6 +97,91 @@ pub fn drain_complete_paragraphs(buffer: &mut String) -> Vec<String> {
|
||||
paragraphs
|
||||
}
|
||||
|
||||
/// Normalize single newlines between prose lines to double newlines.
///
/// LLMs sometimes output text with single newlines between sentences, e.g.:
/// ```text
/// Sentence one.
/// Sentence two.
/// ```
///
/// In Markdown a single newline is a *soft break* and may render as a space
/// (or nothing), causing sentences to appear joined ("sentence one.Sentence
/// two"). This function converts single newlines between non-blank prose
/// lines into double newlines (paragraph breaks) so they render correctly.
///
/// A line counts as *blank* when it is empty **or** contains only whitespace.
/// This matters for CRLF input, where splitting on `\n` leaves a lone `\r` on
/// what is really an empty line: such lines already act as paragraph
/// separators, so no additional blank line is inserted around them.
///
/// Single newlines are **preserved** (not doubled) when either the preceding
/// or following line is a structured Markdown element:
/// - Bullet list items (`- `, `* `, `+ `)
/// - Ordered list items (`1. `, `2. `, …)
/// - ATX headings (`#`, `##`, …)
/// - Table rows (`|`)
/// - Code fence delimiters (`` ``` ``)
/// - Horizontal rules (three or more of `-`, `*`, `_`, optionally spaced)
///
/// Content inside fenced code blocks is also preserved verbatim. Fence
/// tracking is a simple toggle on every line that starts with `` ``` ``.
///
/// Returns a newly allocated `String`; the input is never modified.
pub fn normalize_line_breaks(text: &str) -> String {
    /// `true` when the line holds nothing but whitespace (including a stray
    /// `\r` left behind by CRLF line endings).
    fn is_blank(line: &str) -> bool {
        line.trim().is_empty()
    }

    /// `true` when the line starts a Markdown block construct whose adjacent
    /// single newlines must be left untouched.
    fn is_structured_line(line: &str) -> bool {
        let trimmed = line.trim_start();
        if trimmed.is_empty() {
            return false;
        }

        let has_block_prefix = trimmed.starts_with('#')
            || trimmed.starts_with("- ")
            || trimmed.starts_with("* ")
            || trimmed.starts_with("+ ")
            || trimmed.starts_with('|')
            || trimmed.starts_with("```");
        if has_block_prefix {
            return true;
        }

        // Ordered list: one or more digits followed by ". ".
        // The length comparison guarantees at least one digit was consumed.
        let after_digits = trimmed.trim_start_matches(|c: char| c.is_ascii_digit());
        if after_digits.len() < trimmed.len() && after_digits.starts_with(". ") {
            return true;
        }

        // Horizontal rules: lines made entirely of -, *, or _ (at least 3 chars).
        // Trailing whitespace is dropped first so a rule ending in `\r`
        // (CRLF input) is still recognised.
        let rule = trimmed.trim_end();
        let only_rule_chars = rule.chars().all(|c| matches!(c, '-' | '*' | '_' | ' '));
        let marker_count = rule.chars().filter(|c| !c.is_whitespace()).count();
        only_rule_chars && marker_count >= 3
    }

    // Worst case every newline is doubled; a modest head-room avoids most
    // reallocations without over-reserving for typical input.
    let mut output = String::with_capacity(text.len() + text.len() / 4);
    let mut in_code_fence = false;
    let mut previous: Option<&str> = None;

    for line in text.split('\n') {
        // The fence state is flipped *before* the line is classified, so an
        // opening fence is already "inside" and a closing fence is "outside".
        if line.trim_start().starts_with("```") {
            in_code_fence = !in_code_fence;
        }

        if let Some(prev_line) = previous {
            // Re-emit the newline that `split` consumed.
            output.push('\n');

            // Insert a blank separator only when both neighbours are
            // non-blank prose outside of any code fence.
            let needs_paragraph_break = !in_code_fence
                && !is_blank(line)
                && !is_blank(prev_line)
                && !is_structured_line(line)
                && !is_structured_line(prev_line);
            if needs_paragraph_break {
                output.push('\n');
            }
        }

        output.push_str(line);
        previous = Some(line);
    }

    output
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Tests
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -312,4 +397,92 @@ mod tests {
|
||||
assert_eq!(all_paragraphs, vec!["First para.", "Second para."]);
|
||||
assert_eq!(buf, "Third.");
|
||||
}
|
||||
|
||||
// -- normalize_line_breaks -----------------------------------------------
|
||||
|
||||
/// A lone newline between two prose sentences is widened to a blank line.
#[test]
fn normalize_prose_single_newline_becomes_double() {
    let normalized = normalize_line_breaks("Sentence one.\nSentence two.");
    assert_eq!(normalized, "Sentence one.\n\nSentence two.");
}
|
||||
|
||||
/// Text that already uses paragraph breaks passes through untouched.
#[test]
fn normalize_existing_double_newline_unchanged() {
    let normalized = normalize_line_breaks("Paragraph one.\n\nParagraph two.");
    assert_eq!(normalized, "Paragraph one.\n\nParagraph two.");
}
|
||||
|
||||
/// Consecutive bullet items keep their single-newline separation.
#[test]
fn normalize_bullet_list_single_newlines_preserved() {
    let normalized = normalize_line_breaks("- item one\n- item two\n- item three");
    assert_eq!(normalized, "- item one\n- item two\n- item three");
}
|
||||
|
||||
/// No blank line is inserted between an ATX heading and the text below it.
#[test]
fn normalize_heading_single_newline_preserved() {
    let normalized = normalize_line_breaks("# My Heading\nSome text below.");
    assert_eq!(normalized, "# My Heading\nSome text below.");
}
|
||||
|
||||
/// Table rows must stay on adjacent lines so the table is not broken apart.
#[test]
fn normalize_table_rows_single_newlines_preserved() {
    let normalized = normalize_line_breaks("| Col A | Col B |\n| --- | --- |\n| val1 | val2 |");
    assert_eq!(normalized, "| Col A | Col B |\n| --- | --- |\n| val1 | val2 |");
}
|
||||
|
||||
/// Lines inside a fenced code block are returned exactly as given.
#[test]
fn normalize_code_block_content_preserved_verbatim() {
    let fenced = "```rust\nlet x = 1;\nlet y = 2;\n```";
    assert_eq!(normalize_line_breaks(fenced), fenced);
}
|
||||
|
||||
/// A blank line that is part of fenced code is neither removed nor duplicated.
#[test]
fn normalize_code_block_with_blank_line_inside_preserved() {
    let fenced = "```\nfn foo() {\n let x = 1;\n\n let y = 2;\n}\n```";
    assert_eq!(normalize_line_breaks(fenced), fenced);
}
|
||||
|
||||
/// Prose on either side of a fenced block is split into paragraphs while the
/// fenced block itself is left alone.
#[test]
fn normalize_mixed_prose_and_code_block() {
    let output = normalize_line_breaks(
        "First sentence.\nSecond sentence.\n\n```rust\nlet x = 1;\nlet y = 2;\n```\n\nThird sentence.\nFourth sentence.",
    );

    let leading_prose_split = output.contains("First sentence.\n\nSecond sentence.");
    assert!(leading_prose_split, "prose before code: {output}");

    let code_untouched = output.contains("```rust\nlet x = 1;\nlet y = 2;\n```");
    assert!(code_untouched, "code block preserved: {output}");

    let trailing_prose_split = output.contains("Third sentence.\n\nFourth sentence.");
    assert!(trailing_prose_split, "prose after code: {output}");
}
|
||||
|
||||
/// Numbered list items keep their single-newline separation.
#[test]
fn normalize_ordered_list_single_newlines_preserved() {
    let normalized = normalize_line_breaks("1. First item\n2. Second item\n3. Third item");
    assert_eq!(normalized, "1. First item\n2. Second item\n3. Third item");
}
|
||||
|
||||
/// Empty input yields empty output.
#[test]
fn normalize_empty_string_unchanged() {
    let normalized = normalize_line_breaks("");
    assert_eq!(normalized, "");
}
|
||||
|
||||
/// Input without any newline has nothing to normalize.
#[test]
fn normalize_single_line_unchanged() {
    let normalized = normalize_line_breaks("Hello.");
    assert_eq!(normalized, "Hello.");
}
|
||||
|
||||
/// A bullet line is structured Markdown, so prose directly followed by a
/// bullet item does not get a blank line inserted in between.
#[test]
fn normalize_prose_then_bullet_no_extra_blank() {
    let normalized = normalize_line_breaks("Some prose.\n- bullet item");
    assert_eq!(normalized, "Some prose.\n- bullet item");
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user