"""
Text Utilities Module

Smart text processing for node labels and content display.
Preserves word boundaries and handles multi-line content.
"""

import re
from collections import Counter
from typing import List, Optional

# Per-node-type label limits in characters (optimized for visualization).
_NODE_TYPE_LIMITS = {
    "query": 45,
    "reasoning": 50,
    "hypothesis": 40,
    "conclusion": 50,
    "fact": 35,
    "evidence": 35,
    "constraint": 30,
    "ghost": 30,
    "default": 40,
}

# Basic stop-word list used by extract_key_terms.
_STOP_WORDS = frozenset({
    "the", "and", "for", "are", "but", "not", "you", "all", "can", "had",
    "her", "was", "one", "our", "out", "has", "his", "how", "its", "may",
    "new", "now", "old", "see", "way", "who", "boy", "did", "get", "let",
    "put", "say", "she", "too", "use", "with", "from", "have", "this",
    "that", "been", "your", "than", "they", "will", "more", "when",
})

# Compiled once at import time instead of on every call.
_WORD_RE = re.compile(r'\b[a-z]{3,}\b')
_WHITESPACE_RE = re.compile(r'\s+')


def smart_truncate(text: str, max_length: int = 50, suffix: str = "...") -> str:
    """
    Truncate text at word boundaries.

    The result never exceeds ``max_length`` characters, suffix included.

    Args:
        text: Text to truncate
        max_length: Maximum length before truncation
        suffix: Suffix to add when truncated

    Returns:
        Truncated text preserving whole words where a reasonable amount
        of the text can be kept; otherwise a hard cut.
    """
    if not text:
        return ""

    text = text.strip()
    if len(text) <= max_length:
        return text

    # Characters of real text that fit once the suffix is appended.
    truncate_at = max_length - len(suffix)
    if truncate_at <= 0:
        # No room for any text: emit as much of the suffix as fits rather
        # than slicing with a negative index and overshooting max_length.
        return suffix[:max(max_length, 0)]

    # Search up to and including truncate_at: a space sitting exactly at
    # the cut point means text[:truncate_at] already ends on a whole word.
    last_space = text.rfind(" ", 0, truncate_at + 1)
    if last_space > max_length * 0.5:  # Only if reasonable amount preserved
        return text[:last_space].rstrip() + suffix

    # Fall back to hard truncation
    return text[:truncate_at].rstrip() + suffix


def smart_truncate_multiline(
    text: str,
    max_lines: int = 3,
    max_line_length: int = 50
) -> str:
    """
    Truncate multi-line text intelligently.

    Each of the first ``max_lines`` lines is stripped and truncated with
    smart_truncate; lines that end up empty are dropped. A final "..."
    line is added only when non-blank content was cut off.

    Args:
        text: Multi-line text
        max_lines: Maximum number of lines
        max_line_length: Maximum length per line

    Returns:
        Formatted multi-line text
    """
    if not text:
        return ""

    # A negative count would otherwise slice from the end of the list.
    max_lines = max(max_lines, 0)

    lines = text.split("\n")
    truncated_lines = (
        smart_truncate(line.strip(), max_line_length)
        for line in lines[:max_lines]
    )
    result_lines = [line for line in truncated_lines if line]

    # Blank tail lines (e.g. a trailing newline) are not hidden content.
    if any(line.strip() for line in lines[max_lines:]):
        result_lines.append("...")

    return "\n".join(result_lines)


def create_node_label(
    content: str,
    node_type: str = "default",
    max_length: Optional[int] = None
) -> str:
    """
    Create display label for a graph node.

    Different node types get different truncation limits to optimize
    readability.

    Args:
        content: Full node content
        node_type: Type of node; unknown types use the "default" limit
        max_length: Override max length (None or 0 means "use the
            type-specific limit")

    Returns:
        Formatted label for display; "..." when there is nothing to show
    """
    if not content:
        return "..."

    limit = max_length or _NODE_TYPE_LIMITS.get(
        node_type, _NODE_TYPE_LIMITS["default"]
    )

    # Whitespace-only content truncates to "", so fall back to the same
    # placeholder used for empty content.
    return smart_truncate(content, limit) or "..."


def extract_key_terms(text: str, max_terms: int = 5) -> List[str]:
    """
    Extract key terms from text for search/matching.

    Simple extraction based on word frequency and length.
    For production, consider using TF-IDF or KeyBERT.

    Args:
        text: Text to extract terms from
        max_terms: Maximum terms to return (non-positive yields no terms)

    Returns:
        List of key terms, most frequent first; ties are broken by
        longer word, then by first appearance in the text
    """
    if not text or max_terms <= 0:
        return []

    # Tokenize lower-cased text into ASCII words of 3+ letters and drop
    # stop words.
    word_counts = Counter(
        word
        for word in _WORD_RE.findall(text.lower())
        if word not in _STOP_WORDS
    )

    # Sort by count, then length. sorted() is stable even with
    # reverse=True, so full ties keep first-appearance order.
    ranked = sorted(
        word_counts.items(),
        key=lambda item: (item[1], len(item[0])),
        reverse=True
    )

    return [word for word, _ in ranked[:max_terms]]


def format_confidence(confidence: float) -> str:
    """
    Format confidence score for display.

    Values outside the 0-1 range are clamped before formatting.

    Args:
        confidence: Score between 0 and 1

    Returns:
        Formatted percentage string with no decimals (e.g. "76%")
    """
    if confidence < 0:
        confidence = 0
    elif confidence > 1:
        confidence = 1
    return f"{confidence:.0%}"


def sanitize_content(text: str) -> str:
    """
    Sanitize text content for safe display.

    Drops non-printable characters and collapses every run of whitespace
    into a single space.

    Args:
        text: Raw text

    Returns:
        Sanitized text
    """
    if not text:
        return ""

    # Remove control characters but keep ALL whitespace for now.
    # str.isprintable() is False for "\r", "\f", "\v" and non-breaking
    # space, so filtering on it alone would delete those separators and
    # fuse the words on either side.
    kept = "".join(ch for ch in text if ch.isprintable() or ch.isspace())

    # Normalize whitespace
    return _WHITESPACE_RE.sub(" ", kept).strip()


def highlight_terms(text: str, terms: List[str]) -> str:
    """
    Highlight terms in text (for search results).

    Returns text with terms wrapped in ``**`` markers. Matching is
    case-insensitive and the original casing of the text is preserved.
    Note: For HTML output, convert markers to tags.

    Args:
        text: Text to highlight in
        terms: Terms to highlight; empty or non-string entries are ignored

    Returns:
        Text with highlighted terms
    """
    if not text or not terms:
        return text

    # De-duplicate case-insensitively and discard unusable entries (an
    # empty pattern would match between every character).
    seen = set()
    unique_terms = []
    for term in terms:
        if not isinstance(term, str) or not term:
            continue
        key = term.lower()
        if key not in seen:
            seen.add(key)
            unique_terms.append(term)

    if not unique_terms:
        return text

    # Longest first so "foobar" wins over "foo" in the alternation. One
    # combined pass also keeps later terms from matching inside markers
    # inserted for earlier ones.
    unique_terms.sort(key=len, reverse=True)
    pattern = re.compile(
        "|".join(re.escape(term) for term in unique_terms),
        re.IGNORECASE
    )

    # A function replacement inserts the matched text verbatim: original
    # casing survives and backslashes are not parsed as template escapes.
    return pattern.sub(lambda match: f"**{match.group(0)}**", text)