storkit: merge 427_story_server_side_text_normalization_for_chat_message_line_breaks
This commit is contained in:
@@ -97,6 +97,91 @@ pub fn drain_complete_paragraphs(buffer: &mut String) -> Vec<String> {
|
||||
paragraphs
|
||||
}
|
||||
|
||||
/// Normalize single newlines between prose lines to double newlines.
///
/// LLMs sometimes output text with single newlines between sentences, e.g.:
/// ```text
/// Sentence one.
/// Sentence two.
/// ```
///
/// In Markdown a single newline is a *soft break* and may render as a space
/// (or nothing), causing sentences to appear joined ("sentence one.Sentence
/// two"). This function converts single newlines between non-blank prose
/// lines into double newlines (paragraph breaks) so they render correctly.
///
/// Single newlines are **preserved** (not doubled) when either the preceding
/// or following line is a structured Markdown element:
/// - Bullet list items (`- `, `* `, `+ `)
/// - Ordered list items (`1. `, `2. `, …)
/// - ATX headings (`#`, `##`, …)
/// - Table rows (`|`)
/// - Code fence delimiters (`` ``` ``)
/// - Horizontal rules (three or more of `-`, `*`, `_`, optionally spaced)
///
/// Content inside fenced code blocks is also preserved verbatim.
///
/// A line that contains only whitespace counts as blank. This includes the
/// lone `\r` that remains when CRLF text is split on `\n`, so an existing
/// paragraph break in CRLF input is left untouched instead of gaining extra
/// blank lines.
///
/// The input is split on `\n` and re-joined with `\n`; any inserted separator
/// is an empty line, and every original line is emitted unchanged.
pub fn normalize_line_breaks(text: &str) -> String {
    // A line is blank when nothing but whitespace (spaces, tabs, `\r`) is on it.
    fn is_blank(line: &str) -> bool {
        line.trim().is_empty()
    }

    // Reports whether `line` looks like a structured Markdown element whose
    // neighbouring single newlines must be kept as-is.
    fn is_structured_line(line: &str) -> bool {
        let trimmed = line.trim_start();
        if trimmed.is_empty() {
            return false;
        }
        if trimmed.starts_with('#')
            || trimmed.starts_with("- ")
            || trimmed.starts_with("* ")
            || trimmed.starts_with("+ ")
            || trimmed.starts_with('|')
            || trimmed.starts_with("```")
        {
            return true;
        }
        // Ordered list: one or more digits followed by ". ".
        // The length comparison guarantees at least one digit was stripped.
        let after_digits = trimmed.trim_start_matches(|c: char| c.is_ascii_digit());
        if after_digits.len() < trimmed.len() && after_digits.starts_with(". ") {
            return true;
        }
        // Horizontal rules: lines made entirely of -, *, or _ (at least 3 chars).
        // Any whitespace is tolerated so the character test agrees with the
        // count below and a trailing `\r` does not hide the rule.
        let all_hr_chars = trimmed
            .chars()
            .all(|c| matches!(c, '-' | '*' | '_') || c.is_whitespace());
        let hr_char_count = trimmed.chars().filter(|c| !c.is_whitespace()).count();
        all_hr_chars && hr_char_count >= 3
    }

    let lines: Vec<&str> = text.split('\n').collect();
    // Worst case every line gains a blank separator in front of it.
    let mut result: Vec<&str> = Vec::with_capacity(lines.len() * 2);
    let mut in_code_fence = false;

    for (i, &line) in lines.iter().enumerate() {
        // A fence delimiter flips the state *before* the line is classified,
        // so the opening fence is emitted verbatim and the closing fence falls
        // through to the structured-line check below.
        if line.trim_start().starts_with("```") {
            in_code_fence = !in_code_fence;
        }

        // The first line has no predecessor; fenced content is never touched.
        if i == 0 || in_code_fence {
            result.push(line);
            continue;
        }

        let prev_line = lines[i - 1];

        // Insert a blank separator only when both the current and previous
        // lines are non-blank prose (not structured Markdown).
        let should_double = !is_blank(line)
            && !is_blank(prev_line)
            && !is_structured_line(line)
            && !is_structured_line(prev_line);

        if should_double {
            result.push("");
        }
        result.push(line);
    }

    result.join("\n")
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Tests
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -312,4 +397,92 @@ mod tests {
|
||||
assert_eq!(all_paragraphs, vec!["First para.", "Second para."]);
|
||||
assert_eq!(buf, "Third.");
|
||||
}
|
||||
|
||||
// -- normalize_line_breaks -----------------------------------------------
|
||||
|
||||
#[test]
fn normalize_prose_single_newline_becomes_double() {
    // Two prose sentences split by one newline must come back separated by
    // a blank line.
    let normalized = normalize_line_breaks("Sentence one.\nSentence two.");
    assert_eq!(normalized, "Sentence one.\n\nSentence two.");
}
|
||||
|
||||
#[test]
fn normalize_existing_double_newline_unchanged() {
    // Text that already has a paragraph break must round-trip untouched.
    let already_separated = "Paragraph one.\n\nParagraph two.";
    assert_eq!(
        normalize_line_breaks(already_separated),
        "Paragraph one.\n\nParagraph two."
    );
}
|
||||
|
||||
#[test]
fn normalize_bullet_list_single_newlines_preserved() {
    // Consecutive bullet items are structured lines, so no blank lines are
    // expected between them.
    let bullets = "- item one\n- item two\n- item three";
    assert_eq!(
        normalize_line_breaks(bullets),
        "- item one\n- item two\n- item three"
    );
}
|
||||
|
||||
#[test]
fn normalize_heading_single_newline_preserved() {
    // A heading directly followed by prose keeps its single newline.
    let heading_then_text = "# My Heading\nSome text below.";
    assert_eq!(
        normalize_line_breaks(heading_then_text),
        "# My Heading\nSome text below."
    );
}
|
||||
|
||||
#[test]
fn normalize_table_rows_single_newlines_preserved() {
    // Every row of a pipe table starts with `|`, so rows stay adjacent.
    let table = "| Col A | Col B |\n| --- | --- |\n| val1 | val2 |";
    assert_eq!(
        normalize_line_breaks(table),
        "| Col A | Col B |\n| --- | --- |\n| val1 | val2 |"
    );
}
|
||||
|
||||
#[test]
fn normalize_code_block_content_preserved_verbatim() {
    // Lines between the fence delimiters must not gain blank separators.
    let fenced = "```rust\nlet x = 1;\nlet y = 2;\n```";
    let normalized = normalize_line_breaks(fenced);
    assert_eq!(normalized, fenced);
}
|
||||
|
||||
#[test]
fn normalize_code_block_with_blank_line_inside_preserved() {
    // A blank line inside a fenced block is part of the code and must be
    // returned exactly as given.
    let fenced = "```\nfn foo() {\n let x = 1;\n\n let y = 2;\n}\n```";
    let normalized = normalize_line_breaks(fenced);
    assert_eq!(normalized, fenced);
}
|
||||
|
||||
#[test]
fn normalize_mixed_prose_and_code_block() {
    let output = normalize_line_breaks(
        "First sentence.\nSecond sentence.\n\n```rust\nlet x = 1;\nlet y = 2;\n```\n\nThird sentence.\nFourth sentence.",
    );

    // Prose on either side of the fence is split into paragraphs...
    let prose_before = "First sentence.\n\nSecond sentence.";
    let prose_after = "Third sentence.\n\nFourth sentence.";
    // ...while the fenced block itself comes through untouched.
    let code_block = "```rust\nlet x = 1;\nlet y = 2;\n```";

    assert!(output.contains(prose_before), "prose before code: {output}");
    assert!(output.contains(code_block), "code block preserved: {output}");
    assert!(output.contains(prose_after), "prose after code: {output}");
}
|
||||
|
||||
#[test]
fn normalize_ordered_list_single_newlines_preserved() {
    // Numbered items ("<digits>. ") are structured lines and stay adjacent.
    let numbered = "1. First item\n2. Second item\n3. Third item";
    assert_eq!(
        normalize_line_breaks(numbered),
        "1. First item\n2. Second item\n3. Third item"
    );
}
|
||||
|
||||
#[test]
fn normalize_empty_string_unchanged() {
    // Empty input must produce empty output.
    let normalized = normalize_line_breaks("");
    assert_eq!(normalized, "");
}
|
||||
|
||||
#[test]
fn normalize_single_line_unchanged() {
    // A single line has no newline to double.
    let normalized = normalize_line_breaks("Hello.");
    assert_eq!(normalized, "Hello.");
}
|
||||
|
||||
#[test]
fn normalize_prose_then_bullet_no_extra_blank() {
    // The bullet line is structured, so the newline between the prose line
    // and the bullet is left as a single newline.
    let prose_then_bullet = "Some prose.\n- bullet item";
    assert_eq!(
        normalize_line_breaks(prose_then_bullet),
        "Some prose.\n- bullet item"
    );
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user