Spaces:

sagar008
/

nyaynetra-summarizer

Sleeping

App Files Files Community

sagar008 committed on Jun 17

Commit

c44d8fc

verified ·

1 Parent(s): c07963e

Update chunker.py

Browse files

adding logs for debug

Files changed (1) hide show

chunker.py +17 -1

chunker.py CHANGED Viewed

@@ -1,21 +1,34 @@
 def chunk_by_token_limit(text, max_tokens, tokenizer=None):
     """Split *text* into sentence-aligned chunks of at most *max_tokens* tokens.

     The text is split into sentences with NLTK's ``sent_tokenize`` and the
     sentences are packed greedily, in order, into chunks. A new chunk is
     started whenever adding the next sentence would push the running token
     count above ``max_tokens``.

     Args:
         text: The text to split.
         max_tokens: Token budget per chunk, measured with ``tokenizer``.
         tokenizer: Object exposing ``tokenize(str)`` that returns a sized
             sequence of tokens. When ``None``, the ``T5Tokenizer`` for
             ``"VincentMuriuki/legal-summarizer"`` is loaded.

     Returns:
         A list of stripped chunk strings in their original order. Sentences
         are never split, so a single sentence longer than ``max_tokens``
         becomes its own (oversized) chunk.
     """
     from nltk.tokenize import sent_tokenize
     if tokenizer is None:
         from transformers import T5Tokenizer
         tokenizer = T5Tokenizer.from_pretrained("VincentMuriuki/legal-summarizer")
     chunks = []
     pending = []  # sentences belonging to the chunk currently being built
     pending_tokens = 0
     for sentence in sent_tokenize(text):
         token_count = len(tokenizer.tokenize(sentence))
         # Flush only when something is pending; an oversized first sentence
         # must not produce an empty chunk.
         if pending and pending_tokens + token_count > max_tokens:
             chunks.append(" ".join(pending).strip())
             pending = []
             pending_tokens = 0
         # Every sentence is kept, including the first one of each chunk.
         pending.append(sentence)
         pending_tokens += token_count
     if pending:
         chunks.append(" ".join(pending).strip())
     return chunks

 def chunk_by_token_limit(text, max_tokens, tokenizer=None):
     """Split *text* into sentence-aligned chunks of at most *max_tokens* tokens.

     The text is split into sentences with NLTK's ``sent_tokenize`` and the
     sentences are packed greedily, in order, into chunks. A new chunk is
     started whenever adding the next sentence would push the running token
     count above ``max_tokens``. Progress is reported through the
     ``"chunker"`` logger.

     Args:
         text: The text to split.
         max_tokens: Token budget per chunk, measured with ``tokenizer``.
         tokenizer: Object exposing ``tokenize(str)`` that returns a sized
             sequence of tokens. When ``None``, the ``T5Tokenizer`` for
             ``"VincentMuriuki/legal-summarizer"`` is loaded.

     Returns:
         A list of stripped chunk strings in their original order. Sentences
         are never split, so a single sentence longer than ``max_tokens``
         becomes its own (oversized) chunk.
     """
     import logging
     from nltk.tokenize import sent_tokenize
     # basicConfig does nothing when the root logger already has handlers, so
     # calling it on every invocation is harmless.
     logging.basicConfig(level=logging.INFO)
     logger = logging.getLogger("chunker")
     if tokenizer is None:
         from transformers import T5Tokenizer
         logger.info("🔄 Loading default tokenizer: VincentMuriuki/legal-summarizer")
         tokenizer = T5Tokenizer.from_pretrained("VincentMuriuki/legal-summarizer")
     logger.info("🧠 Starting chunking process...")
     sentences = sent_tokenize(text)
     logger.info("📄 Total sentences found: %d", len(sentences))
     chunks = []
     pending = []  # sentences belonging to the chunk currently being built
     current_token_count = 0
     for number, sentence in enumerate(sentences, start=1):
         token_count = len(tokenizer.tokenize(sentence))
         logger.debug("🔍 Sentence %d: %d tokens", number, token_count)
         if current_token_count + token_count > max_tokens:
             # Flush only when something is pending; an oversized first
             # sentence must not produce an empty chunk.
             if pending:
                 logger.info("✂️ Chunk complete with %d tokens", current_token_count)
                 chunks.append(" ".join(pending).strip())
             pending = [sentence]
             current_token_count = token_count
             logger.info("🚧 Starting new chunk with sentence %d", number)
         else:
             # Always keep the sentence, even when it is the first one of a
             # chunk and nothing is pending yet.
             pending.append(sentence)
             current_token_count += token_count
     if pending:
         logger.info("✅ Final chunk complete with %d tokens", current_token_count)
         chunks.append(" ".join(pending).strip())
     logger.info("📦 Total chunks created: %d", len(chunks))
     return chunks