davidtran999 committed
Commit 7083337 · verified · 1 Parent(s): aa4e737

Upload backend/hue_portal/core/chatbot.py with huggingface_hub
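
For context, a commit like this is typically produced with the huggingface_hub upload API. A minimal sketch (the repo_id below is a placeholder, not the actual target repo):

    from huggingface_hub import HfApi

    api = HfApi()  # uses the token from `huggingface-cli login` by default
    api.upload_file(
        path_or_fileobj="backend/hue_portal/core/chatbot.py",
        path_in_repo="backend/hue_portal/core/chatbot.py",
        repo_id="davidtran999/hue-portal",  # hypothetical repo id
        commit_message="Upload backend/hue_portal/core/chatbot.py with huggingface_hub",
    )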

Files changed (1)
  1. backend/hue_portal/core/chatbot.py +435 -0
backend/hue_portal/core/chatbot.py ADDED
@@ -0,0 +1,435 @@
+ """
+ Chatbot with ML-based intent classification for natural language queries.
+ """
+ import re
+ import unicodedata
+ from typing import Dict, List, Tuple, Any, Optional
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.naive_bayes import MultinomialNB
+ from sklearn.pipeline import Pipeline
+ import numpy as np
+ from .models import Procedure, Fine, Office, Advisory
+ from .search_ml import search_with_ml, expand_query_with_synonyms
+
+
+ # Training data for intent classification
+ INTENT_TRAINING_DATA = {
+     "search_fine": [
+         "mức phạt", "phạt bao nhiêu", "tiền phạt", "vi phạm giao thông",
+         "vượt đèn đỏ", "nồng độ cồn", "không đội mũ bảo hiểm",
+         "mức phạt là gì", "phạt như thế nào", "hành vi vi phạm",
+         "điều luật", "nghị định", "mức xử phạt"
+     ],
+     "search_procedure": [
+         "thủ tục", "làm thủ tục", "hồ sơ", "điều kiện",
+         "thủ tục cư trú", "thủ tục ANTT", "thủ tục PCCC",
+         "cần giấy tờ gì", "làm như thế nào", "quy trình",
+         "thời hạn", "lệ phí", "nơi nộp"
+     ],
+     "search_office": [
+         "địa chỉ", "điểm tiếp dân", "công an", "phòng ban",
+         "số điện thoại", "giờ làm việc", "nơi tiếp nhận",
+         "đơn vị nào", "ở đâu", "liên hệ"
+     ],
+     "search_advisory": [
+         "cảnh báo", "lừa đảo", "scam", "thủ đoạn",
+         "cảnh giác", "an toàn", "bảo mật"
+     ],
+     "general_query": [
+         "xin chào", "giúp tôi", "tư vấn", "hỏi",
+         "thông tin", "tra cứu", "tìm kiếm"
+     ]
+ }
+
+ # Response templates
+ RESPONSE_TEMPLATES = {
+     "search_fine": "Tôi tìm thấy {count} mức phạt liên quan đến '{query}':",
+     "search_procedure": "Tôi tìm thấy {count} thủ tục liên quan đến '{query}':",
+     "search_office": "Tôi tìm thấy {count} đơn vị liên quan đến '{query}':",
+     "search_advisory": "Tôi tìm thấy {count} cảnh báo liên quan đến '{query}':",
+     "general_query": "Tôi có thể giúp bạn tra cứu các văn bản quy định pháp luật về xử lí kỷ luật cán bộ đảng viên. Bạn muốn tìm gì?",
+     "no_results": "Xin lỗi, tôi không tìm thấy thông tin liên quan đến '{query}'. Vui lòng thử lại với từ khóa khác.",
+     "greeting": "Xin chào! Tôi có thể giúp bạn tra cứu các văn bản quy định pháp luật về xử lí kỷ luật cán bộ đảng viên. Bạn cần tìm gì?",
+ }
+
+
+ class Chatbot:
+     def __init__(self):
+         self.intent_classifier = None
+         self.vectorizer = None
+         self._train_classifier()
+
+     def _train_classifier(self):
+         """Train intent classification model."""
+         try:
+             # Prepare training data
+             texts = []
+             labels = []
+
+             for intent, examples in INTENT_TRAINING_DATA.items():
+                 for example in examples:
+                     texts.append(self._preprocess_text(example))
+                     labels.append(intent)
+
+             if not texts:
+                 return
+
+             # Create and train pipeline
+             self.intent_classifier = Pipeline([
+                 ('tfidf', TfidfVectorizer(
+                     analyzer='word',
+                     ngram_range=(1, 2),
+                     min_df=1,
+                     lowercase=True,
+                     token_pattern=r'\b\w+\b'
+                 )),
+                 ('clf', MultinomialNB())
+             ])
+
+             self.intent_classifier.fit(texts, labels)
+         except Exception as e:
+             print(f"Error training classifier: {e}")
+             self.intent_classifier = None
+
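+     # Illustrative check of the trained pipeline (assumes training succeeded):
+     #   self.intent_classifier.predict([self._preprocess_text("mức phạt")])
+     # would yield an intent label such as "search_fine".
+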
+     def _preprocess_text(self, text: str) -> str:
+         """Preprocess text for classification - keep Vietnamese characters."""
+         if not text:
+             return ""
+         text = text.lower().strip()
+         # Only remove punctuation marks; keep all letters (including Vietnamese) and numbers
+         # Removed: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
+         text = re.sub(r'[!"#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~]', ' ', text)
+         text = re.sub(r'\s+', ' ', text)
+         return text.strip()
+
+     def _remove_accents(self, text: str) -> str:
+         """Remove diacritics for accent-insensitive matching."""
+         if not text:
+             return ""
+         normalized = unicodedata.normalize("NFD", text)
+         stripped = "".join(ch for ch in normalized if unicodedata.category(ch) != "Mn")
+         # NFD leaves đ/Đ intact (they carry no combining mark), so fold them
+         # explicitly; otherwise ASCII keywords such as "den" could never match.
+         return stripped.replace("đ", "d").replace("Đ", "D")
+
+     def _keyword_in(self, query_lower: str, query_ascii: str, keyword: str) -> bool:
+         """Check keyword presence in either original or accent-free text."""
+         kw_lower = keyword.lower()
+         if kw_lower in query_lower:
+             return True
+         kw_ascii = self._remove_accents(kw_lower)
+         return kw_ascii in query_ascii
+
+     def classify_intent(self, query: str) -> Tuple[str, float]:
+         """
+         Classify user intent from query.
+         Returns (intent, confidence_score).
+         """
+         # Keyword matching is used exclusively for now; it is more reliable
+         # for Vietnamese than the trained Naive Bayes model above.
+         keyword_intent, keyword_confidence = self._keyword_based_intent(query)
+
+         # Special handling for greetings: accept them only for very simple queries
+         if keyword_intent == "greeting":
+             query_lower = query.lower().strip()
+             query_ascii = self._remove_accents(query_lower)
+             query_words = query_lower.split()
+             # Double-check: if the query has fine-related keywords, it is NOT a greeting
+             fine_indicators = ["phạt", "mức", "vuot", "vượt", "đèn", "den", "vi phạm", "vi pham"]
+             if any(self._keyword_in(query_lower, query_ascii, indicator) for indicator in fine_indicators):
+                 for kw in ["mức phạt", "vi phạm", "đèn đỏ", "vượt đèn", "muc phat", "vuot den", "phat", "vuot", "den", "muc"]:
+                     if self._keyword_in(query_lower, query_ascii, kw):
+                         return ("search_fine", 0.9)
+             # A long query classified as a greeting is probably wrong - fall back to general
+             if len(query_words) > 3:
+                 return ("general_query", 0.5)
+
+         # For all other intents, use the keyword-based result
+         return (keyword_intent, max(keyword_confidence, 0.8))
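+     # Illustrative expectations (not exhaustive tests):
+     #   classify_intent("vượt đèn đỏ phạt bao nhiêu") -> ("search_fine", 0.95)
+     #   classify_intent("xin chào") -> ("greeting", 0.9)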
+
+     def _keyword_based_intent(self, query: str) -> Tuple[str, float]:
+         """Fallback keyword-based intent classification."""
+         # Use the original query (lowercased) to preserve Vietnamese characters
+         query_lower = query.lower().strip()
+         query_ascii = self._remove_accents(query_lower)
+         query_words = query_lower.split()
+
+         # Prioritize fine-related queries: check multi-word phrases first, then
+         # single words. _keyword_in already covers accent-free matching.
+         fine_keywords = ["mức phạt", "vi phạm", "đèn đỏ", "nồng độ cồn", "mũ bảo hiểm", "tốc độ", "bằng lái", "vượt đèn", "mức phạt vượt"]
+         fine_single_words = ["phạt", "vượt", "đèn", "mức", "phat", "vuot", "den"]
+
+         has_fine_keywords = False
+         for kw in fine_keywords:
+             if self._keyword_in(query_lower, query_ascii, kw):
+                 return ("search_fine", 0.95)  # Very high confidence
+         # Single words: return on the first match
+         for kw in fine_single_words:
+             if self._keyword_in(query_lower, query_ascii, kw):
+                 return ("search_fine", 0.9)
+
+         has_procedure_keywords = any(
+             self._keyword_in(query_lower, query_ascii, kw) for kw in
+             ["thủ tục", "hồ sơ", "điều kiện", "cư trú", "antt", "pccc", "thu tuc", "ho so", "dieu kien", "cu tru"]
+         )
+         if has_procedure_keywords:
+             return ("search_procedure", 0.8)
+
+         has_office_keywords = any(
+             self._keyword_in(query_lower, query_ascii, kw) for kw in
+             ["địa chỉ", "điểm tiếp dân", "công an", "số điện thoại", "giờ làm việc", "dia chi", "diem tiep dan", "cong an", "so dien thoai", "gio lam viec"]
+         )
+         if has_office_keywords:
+             return ("search_office", 0.8)
+
+         has_advisory_keywords = any(
+             self._keyword_in(query_lower, query_ascii, kw) for kw in
+             ["cảnh báo", "lừa đảo", "scam", "canh bao", "lua dao"]
+         )
+         if has_advisory_keywords:
+             return ("search_advisory", 0.8)
+
+         # Treat as a greeting only if the query is very short (<= 3 words),
+         # contains a greeting word, and matched no other keyword above
+         has_any_keyword = (has_fine_keywords or has_procedure_keywords or
+                            has_office_keywords or has_advisory_keywords)
+
+         if (len(query_words) <= 3 and
+                 any(self._keyword_in(query_lower, query_ascii, kw) for kw in ["xin chào", "chào", "hello", "hi", "xin chao", "chao"]) and
+                 not has_any_keyword):
+             return ("greeting", 0.9)
+
+         return ("general_query", 0.5)
+
+     def extract_keywords(self, query: str) -> List[str]:
+         """Extract keywords from query for search."""
+         # Tokens from \b\w+\b are single words, so the stopwords must be
+         # single words too; multi-word phrases such as "bao nhiêu" would
+         # never match a token.
+         stopwords = {"là", "gì", "bao", "nhiêu", "như", "thế", "nào",
+                      "ở", "đâu", "của", "và", "hoặc", "tôi", "bạn"}
+
+         words = re.findall(r'\b\w+\b', query.lower())
+         keywords = [w for w in words if w not in stopwords and len(w) > 2]
+
+         return keywords
+
+     def search_by_intent(self, intent: str, query: str, limit: int = 5) -> Dict[str, Any]:
+         """Search based on classified intent."""
+         # Use the original query for better matching, especially for Vietnamese text
+         keywords = query.strip()
+         # Append extracted keywords as an additional signal
+         extracted = " ".join(self.extract_keywords(query))
+         if extracted and len(extracted) > 2:
+             keywords = f"{keywords} {extracted}"
+
+         results = []
+
+         if intent == "search_fine":
+             qs = Fine.objects.all()
+             text_fields = ["name", "code", "article", "decree", "remedial"]
+             search_results = search_with_ml(qs, keywords, text_fields, top_k=limit, min_score=0.1)
+             results = [{"type": "fine", "data": {
+                 "id": f.id,
+                 "name": f.name,
+                 "code": f.code,
+                 "min_fine": float(f.min_fine) if f.min_fine else None,
+                 "max_fine": float(f.max_fine) if f.max_fine else None,
+                 "article": f.article,
+                 "decree": f.decree,
+             }} for f in search_results]
+
+         elif intent == "search_procedure":
+             qs = Procedure.objects.all()
+             text_fields = ["title", "domain", "conditions", "dossier"]
+             search_results = search_with_ml(qs, keywords, text_fields, top_k=limit, min_score=0.1)
+             results = [{"type": "procedure", "data": {
+                 "id": p.id,
+                 "title": p.title,
+                 "domain": p.domain,
+                 "level": p.level,
+             }} for p in search_results]
+
+         elif intent == "search_office":
+             qs = Office.objects.all()
+             text_fields = ["unit_name", "address", "district", "service_scope"]
+             search_results = search_with_ml(qs, keywords, text_fields, top_k=limit, min_score=0.1)
+             results = [{"type": "office", "data": {
+                 "id": o.id,
+                 "unit_name": o.unit_name,
+                 "address": o.address,
+                 "district": o.district,
+                 "phone": o.phone,
+                 "working_hours": o.working_hours,
+             }} for o in search_results]
+
+         elif intent == "search_advisory":
+             qs = Advisory.objects.all()
+             text_fields = ["title", "summary"]
+             search_results = search_with_ml(qs, keywords, text_fields, top_k=limit, min_score=0.1)
+             results = [{"type": "advisory", "data": {
+                 "id": a.id,
+                 "title": a.title,
+                 "summary": a.summary,
+             }} for a in search_results]
+
+         return {
+             "intent": intent,
+             "query": query,
+             "keywords": keywords,
+             "results": results,
+             "count": len(results)
+         }
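+     # Illustrative return shape for intent="search_fine" (values depend on data):
+     # {"intent": "search_fine", "query": "...", "keywords": "...",
+     #  "results": [{"type": "fine", "data": {...}}], "count": 1}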
+
+     def generate_response(self, query: str, session_id: Optional[str] = None) -> Dict[str, Any]:
+         """
+         Generate chatbot response for user query with Dual-Path RAG routing.
+
+         Args:
+             query: User query string.
+             session_id: Optional session ID for context.
+
+         Returns:
+             Dict with message, intent, results, etc.
+         """
+         import time
+         from hue_portal.chatbot.dual_path_router import DualPathRouter
+         from hue_portal.chatbot.fast_path_handler import FastPathHandler
+         from hue_portal.chatbot.slow_path_handler import SlowPathHandler
+         from hue_portal.core.models import QueryRoutingLog
+
+         query = query.strip()
+         start_time = time.time()
+
+         # Classify intent first
+         intent, confidence = self.classify_intent(query)
+
+         # Route decision using the Dual-Path Router
+         router = DualPathRouter()
+         route_decision = router.route(query, intent, confidence)
+
+         # Create the routing log entry up front; timing fields are updated below
+         routing_log = QueryRoutingLog.objects.create(
+             query=query[:500],  # Truncate for storage
+             route=route_decision.path,
+             router_confidence=route_decision.confidence,
+             router_method=route_decision.method,
+             matched_golden_query_id=route_decision.matched_golden_query_id,
+             similarity_score=route_decision.similarity_score,
+             intent=intent,
+             response_time_ms=0  # Updated after the handler runs
+         )
+
+         # Execute the chosen path
+         try:
+             if route_decision.path == "fast_path":
+                 handler = FastPathHandler()
+                 response = handler.handle(query, route_decision.matched_golden_query_id)
+             else:
+                 handler = SlowPathHandler()
+                 response = handler.handle(query, intent, session_id)
+
+             # Optionally save to the golden dataset if high quality
+             if handler._should_save_to_golden(query, response):
+                 self._save_to_golden_dataset(query, intent, response, session_id)
+         except Exception as e:
+             # Fall back to the Slow Path on error
+             import logging
+             logger = logging.getLogger(__name__)
+             logger.error(f"Error in {route_decision.path}: {e}, falling back to Slow Path")
+             handler = SlowPathHandler()
+             response = handler.handle(query, intent, session_id)
+             route_decision.path = "slow_path"
+             route_decision.method = "fallback"
+
+         # Update the log with the path actually taken and the response time
+         elapsed_ms = int((time.time() - start_time) * 1000)
+         routing_log.route = route_decision.path
+         routing_log.router_method = route_decision.method
+         routing_log.response_time_ms = elapsed_ms
+         routing_log.save(update_fields=['route', 'router_method', 'response_time_ms'])
+
+         # Add routing metadata to the response
+         response['_routing'] = {
+             'path': route_decision.path,
+             'method': route_decision.method,
+             'confidence': route_decision.confidence
+         }
+
+         return response
+
+     def _save_to_golden_dataset(
+         self,
+         query: str,
+         intent: str,
+         response: Dict[str, Any],
+         session_id: Optional[str] = None
+     ) -> None:
+         """
+         Save high-quality response to golden dataset for future Fast Path use.
+
+         Args:
+             query: User query.
+             intent: Detected intent.
+             response: Response dict to save.
+             session_id: Optional session ID.
+         """
+         try:
+             from hue_portal.core.models import GoldenQuery
+
+             # Normalize query (lowercase, strip diacritics, collapse whitespace);
+             # re and unicodedata are already imported at module level
+             normalized = query.lower().strip()
+             normalized = unicodedata.normalize("NFD", normalized)
+             normalized = "".join(ch for ch in normalized if unicodedata.category(ch) != "Mn")
+             normalized = re.sub(r'\s+', ' ', normalized).strip()
+
+             # Skip if an active entry already exists
+             if GoldenQuery.objects.filter(query_normalized=normalized, is_active=True).exists():
+                 return
+
+             # Generate an embedding for semantic search (optional, can be done async)
+             query_embedding = None
+             try:
+                 from hue_portal.core.embeddings import get_embedding_model
+                 embedding_model = get_embedding_model()
+                 if embedding_model:
+                     embedding = embedding_model.encode(query, convert_to_numpy=True)
+                     query_embedding = embedding.tolist()
+             except Exception:
+                 pass  # Embedding generation is optional
+
+             # Create the golden query entry
+             GoldenQuery.objects.create(
+                 query=query,
+                 query_normalized=normalized,
+                 query_embedding=query_embedding,
+                 intent=intent,
+                 response_message=response.get("message", ""),
+                 response_data=response,
+                 verified_by="slow_path_auto",  # Auto-saved from the Slow Path
+                 accuracy_score=response.get("confidence", 0.95),
+                 is_active=True
+             )
+
+             import logging
+             logger = logging.getLogger(__name__)
+             logger.info(f"Saved query to golden dataset: {query[:50]}...")
+
+         except Exception as e:
+             import logging
+             logger = logging.getLogger(__name__)
+             logger.warning(f"Error saving to golden dataset: {e}")
+
+
+ # Global chatbot instance
+ _chatbot_instance = None
+
+
+ def get_chatbot() -> Chatbot:
+     """Get or create chatbot instance."""
+     global _chatbot_instance
+     if _chatbot_instance is None:
+         _chatbot_instance = Chatbot()
+     return _chatbot_instance
+
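
A minimal usage sketch (assumes a configured Django project with the hue_portal apps installed and migrations applied, e.g. inside manage.py shell):

    from hue_portal.core.chatbot import get_chatbot

    bot = get_chatbot()  # reuses the module-level singleton
    result = bot.generate_response("mức phạt vượt đèn đỏ")
    print(result["_routing"]["path"], result.get("message", ""))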