storkit: merge 427_story_server_side_text_normalization_for_chat_message_line_breaks

This commit is contained in:
dave
2026-03-28 10:39:13 +00:00
parent 1ae2fa9b9b
commit 52513b55ff
4 changed files with 196 additions and 1 deletions
+173
View File
@@ -97,6 +97,91 @@ pub fn drain_complete_paragraphs(buffer: &mut String) -> Vec<String> {
paragraphs
}
/// Normalize single newlines between prose lines to double newlines.
///
/// LLMs sometimes output text with single newlines between sentences, e.g.:
/// ```text
/// Sentence one.
/// Sentence two.
/// ```
///
/// In Markdown a single newline is a *soft break* and may render as a space
/// (or nothing), causing sentences to appear joined ("sentence one.Sentence
/// two"). This function converts single newlines between non-blank prose
/// lines into double newlines (paragraph breaks) so they render correctly.
///
/// Single newlines are **preserved** (not doubled) when either the preceding
/// or following line is a structured Markdown element:
/// - Bullet list items (`- `, `* `, `+ `)
/// - Ordered list items (`1. `, `2. `, …)
/// - ATX headings (`#`, `##`, …)
/// - Table rows (`|`)
/// - Code fence delimiters (`` ``` ``)
/// - Horizontal rules (three or more of `-`, `*`, `_`, optionally spaced)
///
/// Content inside fenced code blocks is also preserved verbatim.
///
/// A line consisting only of whitespace (including the lone `\r` left behind
/// by a CRLF blank line) is treated as a blank line, so an existing paragraph
/// break is never widened with additional blank lines.
///
/// The input is split on `\n` and re-joined with `\n`; every original line is
/// kept byte-for-byte, only empty separator lines are added.
pub fn normalize_line_breaks(text: &str) -> String {
    /// A line is blank when it holds nothing but whitespace. This covers
    /// `""`, `"   "` and the `"\r"` remnant of a CRLF-terminated empty line.
    fn is_blank(line: &str) -> bool {
        line.trim().is_empty()
    }

    /// Whether `line` is a Markdown construct whose adjacent single newlines
    /// must be left alone.
    fn is_structured_line(line: &str) -> bool {
        let trimmed = line.trim_start();
        if trimmed.is_empty() {
            return false;
        }
        if trimmed.starts_with('#')
            || trimmed.starts_with("- ")
            || trimmed.starts_with("* ")
            || trimmed.starts_with("+ ")
            || trimmed.starts_with('|')
            || trimmed.starts_with("```")
        {
            return true;
        }
        // Ordered list: one or more digits followed by ". ". The length
        // comparison guarantees at least one digit was actually consumed.
        let after_digits = trimmed.trim_start_matches(|c: char| c.is_ascii_digit());
        if after_digits.len() < trimmed.len() && after_digits.starts_with(". ") {
            return true;
        }
        // Horizontal rules: lines made entirely of -, *, or _ (at least 3
        // chars), optionally separated by spaces. Trailing whitespace is
        // dropped first so a rule followed by `\r` is still recognised.
        let rule = trimmed.trim_end();
        let all_hr_chars = rule.chars().all(|c| matches!(c, '-' | '*' | '_' | ' '));
        let hr_char_count = rule.chars().filter(|c| !c.is_whitespace()).count();
        all_hr_chars && hr_char_count >= 3
    }

    let mut result: Vec<&str> = Vec::new();
    let mut in_code_fence = false;
    // `None` only for the very first line, which never gets a separator.
    let mut prev_line: Option<&str> = None;

    for line in text.split('\n') {
        // The fence state is flipped *before* the line is handled: an opening
        // fence is therefore copied verbatim as fenced content, while a
        // closing fence is handled as a (structured) line outside the fence.
        if line.trim_start().starts_with("```") {
            in_code_fence = !in_code_fence;
        }

        if let Some(prev) = prev_line {
            // Insert a blank separator only when both the current and the
            // previous line are non-blank prose outside any code fence.
            let should_double = !in_code_fence
                && !is_blank(line)
                && !is_blank(prev)
                && !is_structured_line(line)
                && !is_structured_line(prev);
            if should_double {
                result.push("");
            }
        }

        result.push(line);
        prev_line = Some(line);
    }

    result.join("\n")
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
@@ -312,4 +397,92 @@ mod tests {
assert_eq!(all_paragraphs, vec!["First para.", "Second para."]);
assert_eq!(buf, "Third.");
}
// -- normalize_line_breaks -----------------------------------------------
#[test]
fn normalize_prose_single_newline_becomes_double() {
    // Two prose sentences separated by one newline must come back separated
    // by a blank line.
    let expected = "Sentence one.\n\nSentence two.";
    assert_eq!(
        normalize_line_breaks("Sentence one.\nSentence two."),
        expected
    );
}
#[test]
fn normalize_existing_double_newline_unchanged() {
    // Text that already has a paragraph break must pass through untouched.
    let text = "Paragraph one.\n\nParagraph two.";
    let normalized = normalize_line_breaks(text);
    assert_eq!(normalized, "Paragraph one.\n\nParagraph two.");
}
#[test]
fn normalize_bullet_list_single_newlines_preserved() {
    // Consecutive `- ` items keep their single newlines.
    let bullets = "- item one\n- item two\n- item three";
    assert_eq!(
        normalize_line_breaks(bullets),
        "- item one\n- item two\n- item three"
    );
}
#[test]
fn normalize_heading_single_newline_preserved() {
    // A heading directly followed by prose must not gain a blank line.
    let expected = "# My Heading\nSome text below.";
    let actual = normalize_line_breaks("# My Heading\nSome text below.");
    assert_eq!(actual, expected);
}
#[test]
fn normalize_table_rows_single_newlines_preserved() {
    // Every row of a pipe table starts with `|`, so no row may be split off.
    let table = "| Col A | Col B |\n| --- | --- |\n| val1 | val2 |";
    assert_eq!(
        normalize_line_breaks(table),
        "| Col A | Col B |\n| --- | --- |\n| val1 | val2 |"
    );
}
#[test]
fn normalize_code_block_content_preserved_verbatim() {
    // Lines between the fence delimiters are copied without extra blanks.
    let fenced = "```rust\nlet x = 1;\nlet y = 2;\n```";
    assert_eq!(normalize_line_breaks(fenced), fenced);
}
#[test]
fn normalize_code_block_with_blank_line_inside_preserved() {
    // A blank line that is part of the fenced code stays exactly where it is.
    let fenced = "```\nfn foo() {\n let x = 1;\n\n let y = 2;\n}\n```";
    let normalized = normalize_line_breaks(fenced);
    assert_eq!(normalized, fenced);
}
#[test]
fn normalize_mixed_prose_and_code_block() {
    // Prose on either side of a fenced block is split into paragraphs while
    // the fenced block itself is left as written.
    let output = normalize_line_breaks(
        "First sentence.\nSecond sentence.\n\n```rust\nlet x = 1;\nlet y = 2;\n```\n\nThird sentence.\nFourth sentence.",
    );

    let prose_before = "First sentence.\n\nSecond sentence.";
    let code_block = "```rust\nlet x = 1;\nlet y = 2;\n```";
    let prose_after = "Third sentence.\n\nFourth sentence.";

    assert!(output.contains(prose_before), "prose before code: {output}");
    assert!(output.contains(code_block), "code block preserved: {output}");
    assert!(output.contains(prose_after), "prose after code: {output}");
}
#[test]
fn normalize_ordered_list_single_newlines_preserved() {
    // Numbered items (`N. `) keep their single newlines.
    let numbered = "1. First item\n2. Second item\n3. Third item";
    assert_eq!(
        normalize_line_breaks(numbered),
        "1. First item\n2. Second item\n3. Third item"
    );
}
#[test]
fn normalize_empty_string_unchanged() {
    // Empty input yields empty output.
    let normalized = normalize_line_breaks("");
    assert_eq!(normalized, "");
}
#[test]
fn normalize_single_line_unchanged() {
    // With no newline in the input there is nothing to double.
    let normalized = normalize_line_breaks("Hello.");
    assert_eq!(normalized, "Hello.");
}
#[test]
fn normalize_prose_then_bullet_no_extra_blank() {
    // The bullet line is structured, so the newline between the prose line
    // and the bullet is kept single.
    let expected = "Some prose.\n- bullet item";
    assert_eq!(normalize_line_breaks("Some prose.\n- bullet item"), expected);
}
}