Files
huskies/server/src/chat/util.rs
T

316 lines
11 KiB
Rust

//! Shared text utilities used by all chat transports.
//!
//! These functions are transport-agnostic helpers for processing chat messages:
//! prefix stripping, bot-mention handling, and paragraph buffering.
/// Case-insensitive prefix strip that also requires the match to end at a
/// word boundary (whitespace, punctuation, or end-of-string).
pub fn strip_prefix_ci<'a>(text: &'a str, prefix: &str) -> Option<&'a str> {
let candidate = text.get(..prefix.len())?;
if !candidate.eq_ignore_ascii_case(prefix) {
return None;
}
let rest = &text[prefix.len()..];
// Must be at end or followed by non-alphanumeric
match rest.chars().next() {
None => Some(rest), // exact match, empty remainder
Some(c) if c.is_alphanumeric() || c == '-' || c == '_' => None, // not a word boundary
_ => Some(rest),
}
}
/// Strip the bot mention prefix from a raw message body.
///
/// Handles these forms (case-insensitive where applicable):
/// - `@bot_localpart:server.com rest` → `rest`
/// - `@bot_localpart rest` → `rest`
/// - `DisplayName rest` → `rest`
pub fn strip_bot_mention<'a>(message: &'a str, bot_name: &str, bot_user_id: &str) -> &'a str {
let trimmed = message.trim();
// Try full Matrix user ID (e.g. "@timmy:homeserver.local")
if let Some(rest) = strip_prefix_ci(trimmed, bot_user_id) {
return rest;
}
// Try @localpart (e.g. "@timmy")
if let Some(localpart) = bot_user_id.split(':').next()
&& let Some(rest) = strip_prefix_ci(trimmed, localpart)
{
return rest;
}
// Try display name (e.g. "Timmy")
if let Some(rest) = strip_prefix_ci(trimmed, bot_name) {
return rest;
}
trimmed
}
/// Returns `true` when `text` ends while inside an open fenced code block.
///
/// A fenced code block opens and closes on lines that start with ` ``` `
/// (three or more backticks). We count the fence markers and return `true`
/// when the count is odd (a fence was opened but not yet closed).
fn is_inside_code_fence(text: &str) -> bool {
let mut in_fence = false;
for line in text.lines() {
if line.trim_start().starts_with("```") {
in_fence = !in_fence;
}
}
in_fence
}
/// Drain all complete paragraphs from `buffer` and return them.
///
/// A paragraph boundary is a double newline (`\n\n`). Each drained paragraph
/// is trimmed of surrounding whitespace; empty paragraphs are discarded.
/// The buffer is left with only the remaining incomplete text.
///
/// **Code-fence awareness:** a `\n\n` that occurs *inside* a fenced code
/// block (delimited by ` ``` ` lines) is **not** treated as a paragraph
/// boundary. This prevents a blank line inside a code block from splitting
/// the fence across multiple messages, which would corrupt the rendering.
pub fn drain_complete_paragraphs(buffer: &mut String) -> Vec<String> {
let mut paragraphs = Vec::new();
let mut search_from = 0;
loop {
let Some(pos) = buffer[search_from..].find("\n\n") else {
break;
};
let abs_pos = search_from + pos;
// Only split at this boundary when we are NOT inside a code fence.
if is_inside_code_fence(&buffer[..abs_pos]) {
// Skip past this \n\n and keep looking for the next boundary.
search_from = abs_pos + 2;
} else {
let chunk = buffer[..abs_pos].trim().to_string();
*buffer = buffer[abs_pos + 2..].to_string();
search_from = 0;
if !chunk.is_empty() {
paragraphs.push(chunk);
}
}
}
paragraphs
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
#[cfg(test)]
mod tests {
use super::*;
// -- strip_prefix_ci ----------------------------------------------------
#[test]
fn strip_prefix_ci_basic() {
assert_eq!(strip_prefix_ci("Hello world", "hello"), Some(" world"));
}
#[test]
fn strip_prefix_ci_no_match() {
assert_eq!(strip_prefix_ci("goodbye", "hello"), None);
}
#[test]
fn strip_prefix_ci_word_boundary_required() {
assert_eq!(strip_prefix_ci("helloworld", "hello"), None);
}
#[test]
fn strip_prefix_ci_exact_match() {
assert_eq!(strip_prefix_ci("hello", "hello"), Some(""));
}
#[test]
fn strip_prefix_ci_multibyte_no_panic_smart_quote() {
// "abcde\u{2019}xyz" — U+2019 is 3 bytes starting at byte 5.
// A prefix of length 6 (e.g. "abcdef") lands inside the 3-byte char.
// Previously this caused: "byte index 6 is not a char boundary".
let text = "abcde\u{2019}xyz";
assert_eq!(strip_prefix_ci(text, "abcdef"), None);
}
#[test]
fn strip_prefix_ci_multibyte_no_panic_emoji() {
// U+1F600 is 4 bytes starting at byte 3. Prefix length 4 lands inside it.
let text = "abc\u{1F600}def";
assert_eq!(strip_prefix_ci(text, "abcd"), None);
}
// -- strip_bot_mention --------------------------------------------------
#[test]
fn strip_mention_full_user_id() {
let rest = strip_bot_mention(
"@timmy:homeserver.local help",
"Timmy",
"@timmy:homeserver.local",
);
assert_eq!(rest.trim(), "help");
}
#[test]
fn strip_mention_localpart() {
let rest = strip_bot_mention("@timmy help me", "Timmy", "@timmy:homeserver.local");
assert_eq!(rest.trim(), "help me");
}
#[test]
fn strip_mention_display_name() {
let rest = strip_bot_mention("Timmy help", "Timmy", "@timmy:homeserver.local");
assert_eq!(rest.trim(), "help");
}
#[test]
fn strip_mention_display_name_case_insensitive() {
let rest = strip_bot_mention("timmy help", "Timmy", "@timmy:homeserver.local");
assert_eq!(rest.trim(), "help");
}
#[test]
fn strip_mention_no_match_returns_original() {
let rest = strip_bot_mention("hello world", "Timmy", "@timmy:homeserver.local");
assert_eq!(rest, "hello world");
}
#[test]
fn strip_mention_does_not_match_longer_name() {
// "@timmybot" should NOT match "@timmy"
let rest = strip_bot_mention("@timmybot help", "Timmy", "@timmy:homeserver.local");
assert_eq!(rest, "@timmybot help");
}
#[test]
fn strip_mention_comma_after_name() {
let rest = strip_bot_mention("@timmy, help", "Timmy", "@timmy:homeserver.local");
assert_eq!(rest.trim().trim_start_matches(',').trim(), "help");
}
// -- drain_complete_paragraphs ------------------------------------------
#[test]
fn drain_complete_paragraphs_no_boundary_returns_empty() {
let mut buf = "Hello World".to_string();
let paras = drain_complete_paragraphs(&mut buf);
assert!(paras.is_empty());
assert_eq!(buf, "Hello World");
}
#[test]
fn drain_complete_paragraphs_single_boundary() {
let mut buf = "Paragraph one.\n\nParagraph two.".to_string();
let paras = drain_complete_paragraphs(&mut buf);
assert_eq!(paras, vec!["Paragraph one."]);
assert_eq!(buf, "Paragraph two.");
}
#[test]
fn drain_complete_paragraphs_multiple_boundaries() {
let mut buf = "A\n\nB\n\nC".to_string();
let paras = drain_complete_paragraphs(&mut buf);
assert_eq!(paras, vec!["A", "B"]);
assert_eq!(buf, "C");
}
#[test]
fn drain_complete_paragraphs_trailing_boundary() {
let mut buf = "A\n\nB\n\n".to_string();
let paras = drain_complete_paragraphs(&mut buf);
assert_eq!(paras, vec!["A", "B"]);
assert_eq!(buf, "");
}
#[test]
fn drain_complete_paragraphs_empty_input() {
let mut buf = String::new();
let paras = drain_complete_paragraphs(&mut buf);
assert!(paras.is_empty());
assert_eq!(buf, "");
}
#[test]
fn drain_complete_paragraphs_skips_empty_chunks() {
// Consecutive double-newlines produce no empty paragraphs.
let mut buf = "\n\n\n\nHello".to_string();
let paras = drain_complete_paragraphs(&mut buf);
assert!(paras.is_empty());
assert_eq!(buf, "Hello");
}
#[test]
fn drain_complete_paragraphs_trims_whitespace() {
let mut buf = " Hello \n\n World ".to_string();
let paras = drain_complete_paragraphs(&mut buf);
assert_eq!(paras, vec!["Hello"]);
assert_eq!(buf, " World ");
}
// -- drain_complete_paragraphs: code-fence awareness -------------------
#[test]
fn drain_complete_paragraphs_code_fence_blank_line_not_split() {
// A blank line inside a fenced code block must NOT trigger a split.
let mut buf =
"```rust\nfn foo() {\n let x = 1;\n\n let y = 2;\n}\n```\n\nNext paragraph."
.to_string();
let paras = drain_complete_paragraphs(&mut buf);
assert_eq!(
paras.len(),
1,
"code fence with blank line should not be split into multiple messages: {paras:?}"
);
assert!(
paras[0].starts_with("```rust"),
"first paragraph should be the code fence: {:?}",
paras[0]
);
assert!(
paras[0].contains("let y = 2;"),
"code fence should contain content from both sides of the blank line: {:?}",
paras[0]
);
assert_eq!(buf, "Next paragraph.");
}
#[test]
fn drain_complete_paragraphs_text_before_and_after_fenced_block() {
// Text paragraph, then a code block with an internal blank line, then more text.
let mut buf = "Before\n\n```\ncode\n\nmore code\n```\n\nAfter".to_string();
let paras = drain_complete_paragraphs(&mut buf);
assert_eq!(paras.len(), 2, "expected two paragraphs: {paras:?}");
assert_eq!(paras[0], "Before");
assert!(
paras[1].starts_with("```"),
"second paragraph should be the code fence: {:?}",
paras[1]
);
assert!(
paras[1].contains("more code"),
"code fence content must include the part after the blank line: {:?}",
paras[1]
);
assert_eq!(buf, "After");
}
#[test]
fn drain_complete_paragraphs_incremental_simulation() {
// Simulate tokens arriving one character at a time.
let mut buf = String::new();
let mut all_paragraphs = Vec::new();
for ch in "First para.\n\nSecond para.\n\nThird.".chars() {
buf.push(ch);
all_paragraphs.extend(drain_complete_paragraphs(&mut buf));
}
assert_eq!(all_paragraphs, vec!["First para.", "Second para."]);
assert_eq!(buf, "Third.");
}
}