```python
import logging

# sent_tokenize requires the NLTK "punkt" sentence model: nltk.download("punkt")
from nltk.tokenize import sent_tokenize

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("chunker")


def chunk_by_token_limit(text, max_tokens, tokenizer=None):
    """Split text into chunks of at most max_tokens tokens, on sentence boundaries."""
    if tokenizer is None:
        # Lazy import so transformers is only loaded when no tokenizer is supplied.
        from transformers import T5Tokenizer
        logger.info("Loading default tokenizer: VincentMuriuki/legal-summarizer")
        tokenizer = T5Tokenizer.from_pretrained("VincentMuriuki/legal-summarizer")

    logger.info("Starting chunking process...")
    sentences = sent_tokenize(text)
    logger.info(f"Total sentences found: {len(sentences)}")

    chunks = []
    current_chunk = ""
    current_token_count = 0

    for idx, sentence in enumerate(sentences):
        token_count = len(tokenizer.tokenize(sentence))
        logger.debug(f"Sentence {idx + 1}: {token_count} tokens")

        if current_token_count + token_count > max_tokens:
            # Adding this sentence would overflow the limit: flush the current
            # chunk and start a new one with this sentence. Note that a single
            # sentence longer than max_tokens still becomes its own (oversized)
            # chunk on the next flush.
            if current_chunk:
                logger.info(f"Chunk complete with {current_token_count} tokens")
                chunks.append(current_chunk.strip())
            current_chunk = sentence
            current_token_count = token_count
            logger.info(f"Starting new chunk with sentence {idx + 1}")
        else:
            current_chunk = f"{current_chunk} {sentence}" if current_chunk else sentence
            current_token_count += token_count

    # Flush whatever remains after the last sentence.
    if current_chunk:
        logger.info(f"Final chunk complete with {current_token_count} tokens")
        chunks.append(current_chunk.strip())

    logger.info(f"Total chunks created: {len(chunks)}")
    return chunks
```
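For reference, a minimal usage sketch is shown below. The 512-token limit and the sample text are illustrative assumptions, not values from the original Space; in practice you would pass in the tokenizer you already use for summarization so chunk boundaries match the model's token counts.

```python
# Usage sketch (assumed values: the 512-token limit and the sample text are
# illustrative only).
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("VincentMuriuki/legal-summarizer")
text = "First sentence of a long document. Second sentence. Third sentence."
chunks = chunk_by_token_limit(text, max_tokens=512, tokenizer=tokenizer)

# Each chunk stays within the token budget, so it can be summarized in one pass.
for i, chunk in enumerate(chunks, start=1):
    print(f"Chunk {i}: {len(tokenizer.tokenize(chunk))} tokens")
```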