Spaces:
Sleeping
Sleeping
Commit
·
3a494fe
1
Parent(s):
0d901a7
style: Complete remaining SIM108 improvements and ruff formatting
Browse files
- Apply final ternary operator simplification in content_extractor.py
- Include ruff formatter automatic improvements to code style
- Now only 3 ruff errors remain (2 SIM117, 1 SIM108)
These are all minor style improvements that can be addressed if desired.
.pre-commit-hooks/run_staged_tests.py
CHANGED
|
@@ -11,9 +11,7 @@ import time
|
|
| 11 |
from typing import List, Set
|
| 12 |
|
| 13 |
# ロギング設定
|
| 14 |
-
logging.basicConfig(
|
| 15 |
-
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
| 16 |
-
)
|
| 17 |
logger = logging.getLogger("run_staged_tests")
|
| 18 |
|
| 19 |
|
|
@@ -70,9 +68,7 @@ def get_test_files_to_run(staged_files: List[str]) -> Set[str]:
|
|
| 70 |
check=True,
|
| 71 |
)
|
| 72 |
for test_file in matching_tests.stdout.strip().split("\n"):
|
| 73 |
-
if (
|
| 74 |
-
test_file and "test_audio_generator.py" not in test_file
|
| 75 |
-
): # Skip empty lines and problematic test
|
| 76 |
test_files.add(test_file)
|
| 77 |
except subprocess.CalledProcessError:
|
| 78 |
pass
|
|
@@ -95,11 +91,7 @@ def run_pytest(test_files: Set[str]) -> bool:
|
|
| 95 |
venv_pytest = "venv/bin/python -m pytest"
|
| 96 |
|
| 97 |
# Use venv pytest if available, otherwise try system pytest
|
| 98 |
-
if os.path.exists("venv/bin/python"):
|
| 99 |
-
# タイムアウト(秒)を指定して実行
|
| 100 |
-
cmd = f"{venv_pytest} {' '.join(test_files)} -v --timeout=30"
|
| 101 |
-
else:
|
| 102 |
-
cmd = f"python -m pytest {' '.join(test_files)} -v --timeout=30"
|
| 103 |
|
| 104 |
logger.info(f"Running: {cmd}")
|
| 105 |
|
|
|
|
| 11 |
from typing import List, Set
|
| 12 |
|
| 13 |
# ロギング設定
|
| 14 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
|
|
|
|
|
|
|
| 15 |
logger = logging.getLogger("run_staged_tests")
|
| 16 |
|
| 17 |
|
|
|
|
| 68 |
check=True,
|
| 69 |
)
|
| 70 |
for test_file in matching_tests.stdout.strip().split("\n"):
|
| 71 |
+
if test_file and "test_audio_generator.py" not in test_file: # Skip empty lines and problematic test
|
|
|
|
|
|
|
| 72 |
test_files.add(test_file)
|
| 73 |
except subprocess.CalledProcessError:
|
| 74 |
pass
|
|
|
|
| 91 |
venv_pytest = "venv/bin/python -m pytest"
|
| 92 |
|
| 93 |
# Use venv pytest if available, otherwise try system pytest
|
| 94 |
+
cmd = f"{venv_pytest} {' '.join(test_files)} -v --timeout=30" if os.path.exists("venv/bin/python") else f"python -m pytest {' '.join(test_files)} -v --timeout=30"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
logger.info(f"Running: {cmd}")
|
| 97 |
|
tests/e2e/conftest.py
CHANGED
|
@@ -94,10 +94,7 @@ def browser():
|
|
| 94 |
Browser: Playwrightブラウザインスタンス
|
| 95 |
"""
|
| 96 |
with sync_playwright() as playwright:
|
| 97 |
-
if os.environ.get("HEADLESS", "true").lower() == "true":
|
| 98 |
-
browser = playwright.chromium.launch(headless=True)
|
| 99 |
-
else:
|
| 100 |
-
browser = playwright.chromium.launch(headless=False, slow_mo=100)
|
| 101 |
|
| 102 |
yield browser
|
| 103 |
|
|
@@ -125,9 +122,7 @@ def pytest_bdd_apply_tag(tag, function):
|
|
| 125 |
return None
|
| 126 |
|
| 127 |
|
| 128 |
-
def pytest_bdd_step_error(
|
| 129 |
-
request, feature, scenario, step, step_func, step_func_args, exception
|
| 130 |
-
):
|
| 131 |
"""
|
| 132 |
ステップが失敗した場合のフック
|
| 133 |
|
|
@@ -146,9 +141,7 @@ def pytest_bdd_step_error(
|
|
| 146 |
step_name = step.name.replace(" ", "_")
|
| 147 |
timestamp = int(time.time())
|
| 148 |
|
| 149 |
-
screenshot_path = os.path.join(
|
| 150 |
-
screenshot_dir, f"error_{scenario_name}_{step_name}_{timestamp}.png"
|
| 151 |
-
)
|
| 152 |
|
| 153 |
page.screenshot(path=screenshot_path)
|
| 154 |
logger.error(f"スクリーンショットが保存されました: {screenshot_path}")
|
|
|
|
| 94 |
Browser: Playwrightブラウザインスタンス
|
| 95 |
"""
|
| 96 |
with sync_playwright() as playwright:
|
| 97 |
+
browser = playwright.chromium.launch(headless=True) if os.environ.get("HEADLESS", "true").lower() == "true" else playwright.chromium.launch(headless=False, slow_mo=100)
|
|
|
|
|
|
|
|
|
|
| 98 |
|
| 99 |
yield browser
|
| 100 |
|
|
|
|
| 122 |
return None
|
| 123 |
|
| 124 |
|
| 125 |
+
def pytest_bdd_step_error(request, feature, scenario, step, step_func, step_func_args, exception):
|
|
|
|
|
|
|
| 126 |
"""
|
| 127 |
ステップが失敗した場合のフック
|
| 128 |
|
|
|
|
| 141 |
step_name = step.name.replace(" ", "_")
|
| 142 |
timestamp = int(time.time())
|
| 143 |
|
| 144 |
+
screenshot_path = os.path.join(screenshot_dir, f"error_{scenario_name}_{step_name}_{timestamp}.png")
|
|
|
|
|
|
|
| 145 |
|
| 146 |
page.screenshot(path=screenshot_path)
|
| 147 |
logger.error(f"スクリーンショットが保存されました: {screenshot_path}")
|
yomitalk/components/audio_generator.py
CHANGED
|
@@ -70,13 +70,8 @@ class VoicevoxCoreManager:
|
|
| 70 |
self.core_initialized = False
|
| 71 |
|
| 72 |
# 1. Check existence of required directories
|
| 73 |
-
if (
|
| 74 |
-
not self.VOICEVOX_MODELS_PATH.exists()
|
| 75 |
-
or not self.VOICEVOX_DICT_PATH.exists()
|
| 76 |
-
):
|
| 77 |
-
logger.warning(
|
| 78 |
-
"Required VOICEVOX directories not found. Please run 'make download-voicevox-core'"
|
| 79 |
-
)
|
| 80 |
return
|
| 81 |
|
| 82 |
try:
|
|
@@ -84,9 +79,7 @@ class VoicevoxCoreManager:
|
|
| 84 |
open_jtalk = self._initialize_openjtalk()
|
| 85 |
|
| 86 |
# 3. Initialize ONNX Runtime
|
| 87 |
-
runtime_path = str(
|
| 88 |
-
self.VOICEVOX_LIB_PATH / "libvoicevox_onnxruntime.so.1.17.3"
|
| 89 |
-
)
|
| 90 |
|
| 91 |
if os.path.exists(runtime_path):
|
| 92 |
logger.info("Loading ONNX runtime from local path")
|
|
@@ -102,9 +95,7 @@ class VoicevoxCoreManager:
|
|
| 102 |
loaded_count = self._load_voice_models()
|
| 103 |
|
| 104 |
if loaded_count > 0:
|
| 105 |
-
logger.info(
|
| 106 |
-
f"Successfully loaded {loaded_count}/{len(REQUIRED_MODEL_FILES)} voice models"
|
| 107 |
-
)
|
| 108 |
self.core_initialized = True
|
| 109 |
else:
|
| 110 |
logger.error("No voice models could be loaded")
|
|
@@ -229,16 +220,12 @@ class VoicevoxCoreManager:
|
|
| 229 |
if original_surface != word.surface:
|
| 230 |
self.user_dict_words.add(original_surface)
|
| 231 |
|
| 232 |
-
logger.debug(
|
| 233 |
-
f"Loaded user dict word: {word.surface} (original: {original_surface})"
|
| 234 |
-
)
|
| 235 |
|
| 236 |
except Exception as e:
|
| 237 |
logger.warning(f"Failed to load user dictionary words: {e}")
|
| 238 |
|
| 239 |
-
logger.info(
|
| 240 |
-
f"Loaded {len(self.user_dict_words)} user dictionary surface forms for conversion checking"
|
| 241 |
-
)
|
| 242 |
|
| 243 |
def is_word_in_user_dict(self, word: str) -> bool:
|
| 244 |
"""
|
|
@@ -386,12 +373,8 @@ class AudioGenerator:
|
|
| 386 |
If not provided, defaults to "data/temp/talks"
|
| 387 |
"""
|
| 388 |
# Use session-specific directories if provided
|
| 389 |
-
self.output_dir = (
|
| 390 |
-
session_output_dir if session_output_dir else Path("data/output")
|
| 391 |
-
)
|
| 392 |
-
self.temp_dir = (
|
| 393 |
-
session_temp_dir if session_temp_dir else Path("data/temp/talks")
|
| 394 |
-
)
|
| 395 |
|
| 396 |
# Make sure directories exist
|
| 397 |
self.output_dir.mkdir(parents=True, exist_ok=True)
|
|
@@ -454,9 +437,7 @@ class AudioGenerator:
|
|
| 454 |
result.extend([uppercase_part, "ズ"])
|
| 455 |
else:
|
| 456 |
# 英単語のパターンに基づいて分割(キャメルケース対応)
|
| 457 |
-
segments = re.findall(
|
| 458 |
-
r"([A-Z]{2,}(?=[A-Z][a-z]|$)|[A-Z][a-z]*|[a-z]+)", part
|
| 459 |
-
)
|
| 460 |
result.extend(segments)
|
| 461 |
else:
|
| 462 |
# 英単語以外はそのまま追加
|
|
@@ -509,11 +490,7 @@ class AudioGenerator:
|
|
| 509 |
needs_space = word_count >= 6 # 6単語以上続く
|
| 510 |
|
| 511 |
# 特定の品詞の前後で息継ぎ
|
| 512 |
-
if (
|
| 513 |
-
last_part.lower() in self.BE_VERBS
|
| 514 |
-
or part.lower() in self.PREPOSITIONS
|
| 515 |
-
or part.lower() in self.CONJUNCTIONS
|
| 516 |
-
) and word_count >= 4:
|
| 517 |
needs_space = True
|
| 518 |
|
| 519 |
if needs_space:
|
|
@@ -530,9 +507,7 @@ class AudioGenerator:
|
|
| 530 |
elif not is_english_word:
|
| 531 |
# 英単語でない場合はそのまま
|
| 532 |
part_to_add = part
|
| 533 |
-
elif is_all_uppercase and (
|
| 534 |
-
len(part) <= 3 or (len(part) <= 6 and not is_romaji_readable(part))
|
| 535 |
-
):
|
| 536 |
# 大文字のみで構成され、字数が少なくてローマ字読みできない場合はアルファベット読みして欲しいためそのまま
|
| 537 |
# (字数が3文字以下なら基本的にアルファベット読みで良く, 駄目であればCONVERSION_OVERRIDEなどで変換する)
|
| 538 |
part_to_add = part
|
|
@@ -546,9 +521,7 @@ class AudioGenerator:
|
|
| 546 |
|
| 547 |
return "".join(result)
|
| 548 |
|
| 549 |
-
def generate_character_conversation(
|
| 550 |
-
self, podcast_text: str
|
| 551 |
-
) -> Generator[Optional[str], None, None]:
|
| 552 |
"""
|
| 553 |
Generate audio for a character conversation from podcast text with streaming support.
|
| 554 |
|
|
@@ -614,10 +587,7 @@ class AudioGenerator:
|
|
| 614 |
conversation_parts = []
|
| 615 |
|
| 616 |
# キャラクターパターンを取得
|
| 617 |
-
character_patterns = {
|
| 618 |
-
char.display_name: [f"{char.display_name}:", f"{char.display_name}:"]
|
| 619 |
-
for char in Character
|
| 620 |
-
}
|
| 621 |
|
| 622 |
# 複数行のセリフを処理するために現在の話者と発言を記録
|
| 623 |
current_speaker = None
|
|
@@ -663,9 +633,7 @@ class AudioGenerator:
|
|
| 663 |
|
| 664 |
# 会話部分が見つからない場合はフォーマット修正を試みる
|
| 665 |
if not conversation_parts:
|
| 666 |
-
logger.warning(
|
| 667 |
-
"No valid conversation parts found. Attempting to fix format..."
|
| 668 |
-
)
|
| 669 |
fixed_text = self._fix_conversation_format(podcast_text)
|
| 670 |
if fixed_text != podcast_text:
|
| 671 |
return self._extract_conversation_parts(fixed_text)
|
|
@@ -801,10 +769,9 @@ class AudioGenerator:
|
|
| 801 |
# 現在の話者の発言として処理
|
| 802 |
if line_stripped:
|
| 803 |
current_speech.append(line_stripped)
|
| 804 |
-
elif current_speech:
|
| 805 |
# 段落区切りの空行
|
| 806 |
-
if not current_speech[-1].endswith("\n"):
|
| 807 |
-
current_speech[-1] += "\n"
|
| 808 |
elif line_stripped:
|
| 809 |
# 話者が一度も検出されていない場合はデフォルト設定
|
| 810 |
current_speaker = Character.ZUNDAMON.display_name
|
|
|
|
| 70 |
self.core_initialized = False
|
| 71 |
|
| 72 |
# 1. Check existence of required directories
|
| 73 |
+
if not self.VOICEVOX_MODELS_PATH.exists() or not self.VOICEVOX_DICT_PATH.exists():
|
| 74 |
+
logger.warning("Required VOICEVOX directories not found. Please run 'make download-voicevox-core'")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
return
|
| 76 |
|
| 77 |
try:
|
|
|
|
| 79 |
open_jtalk = self._initialize_openjtalk()
|
| 80 |
|
| 81 |
# 3. Initialize ONNX Runtime
|
| 82 |
+
runtime_path = str(self.VOICEVOX_LIB_PATH / "libvoicevox_onnxruntime.so.1.17.3")
|
|
|
|
|
|
|
| 83 |
|
| 84 |
if os.path.exists(runtime_path):
|
| 85 |
logger.info("Loading ONNX runtime from local path")
|
|
|
|
| 95 |
loaded_count = self._load_voice_models()
|
| 96 |
|
| 97 |
if loaded_count > 0:
|
| 98 |
+
logger.info(f"Successfully loaded {loaded_count}/{len(REQUIRED_MODEL_FILES)} voice models")
|
|
|
|
|
|
|
| 99 |
self.core_initialized = True
|
| 100 |
else:
|
| 101 |
logger.error("No voice models could be loaded")
|
|
|
|
| 220 |
if original_surface != word.surface:
|
| 221 |
self.user_dict_words.add(original_surface)
|
| 222 |
|
| 223 |
+
logger.debug(f"Loaded user dict word: {word.surface} (original: {original_surface})")
|
|
|
|
|
|
|
| 224 |
|
| 225 |
except Exception as e:
|
| 226 |
logger.warning(f"Failed to load user dictionary words: {e}")
|
| 227 |
|
| 228 |
+
logger.info(f"Loaded {len(self.user_dict_words)} user dictionary surface forms for conversion checking")
|
|
|
|
|
|
|
| 229 |
|
| 230 |
def is_word_in_user_dict(self, word: str) -> bool:
|
| 231 |
"""
|
|
|
|
| 373 |
If not provided, defaults to "data/temp/talks"
|
| 374 |
"""
|
| 375 |
# Use session-specific directories if provided
|
| 376 |
+
self.output_dir = session_output_dir if session_output_dir else Path("data/output")
|
| 377 |
+
self.temp_dir = session_temp_dir if session_temp_dir else Path("data/temp/talks")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 378 |
|
| 379 |
# Make sure directories exist
|
| 380 |
self.output_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
| 437 |
result.extend([uppercase_part, "ズ"])
|
| 438 |
else:
|
| 439 |
# 英単語のパターンに基づいて分割(キャメルケース対応)
|
| 440 |
+
segments = re.findall(r"([A-Z]{2,}(?=[A-Z][a-z]|$)|[A-Z][a-z]*|[a-z]+)", part)
|
|
|
|
|
|
|
| 441 |
result.extend(segments)
|
| 442 |
else:
|
| 443 |
# 英単語以外はそのまま追加
|
|
|
|
| 490 |
needs_space = word_count >= 6 # 6単語以上続く
|
| 491 |
|
| 492 |
# 特定の品詞の前後で息継ぎ
|
| 493 |
+
if (last_part.lower() in self.BE_VERBS or part.lower() in self.PREPOSITIONS or part.lower() in self.CONJUNCTIONS) and word_count >= 4:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 494 |
needs_space = True
|
| 495 |
|
| 496 |
if needs_space:
|
|
|
|
| 507 |
elif not is_english_word:
|
| 508 |
# 英単語でない場合はそのまま
|
| 509 |
part_to_add = part
|
| 510 |
+
elif is_all_uppercase and (len(part) <= 3 or (len(part) <= 6 and not is_romaji_readable(part))):
|
|
|
|
|
|
|
| 511 |
# 大文字のみで構成され、字数が少なくてローマ字読みできない場合はアルファベット読みして欲しいためそのまま
|
| 512 |
# (字数が3文字以下なら基本的にアルファベット読みで良く, 駄目であればCONVERSION_OVERRIDEなどで変換する)
|
| 513 |
part_to_add = part
|
|
|
|
| 521 |
|
| 522 |
return "".join(result)
|
| 523 |
|
| 524 |
+
def generate_character_conversation(self, podcast_text: str) -> Generator[Optional[str], None, None]:
|
|
|
|
|
|
|
| 525 |
"""
|
| 526 |
Generate audio for a character conversation from podcast text with streaming support.
|
| 527 |
|
|
|
|
| 587 |
conversation_parts = []
|
| 588 |
|
| 589 |
# キャラクターパターンを取得
|
| 590 |
+
character_patterns = {char.display_name: [f"{char.display_name}:", f"{char.display_name}:"] for char in Character}
|
|
|
|
|
|
|
|
|
|
| 591 |
|
| 592 |
# 複数行のセリフを処理するために現在の話者と発言を記録
|
| 593 |
current_speaker = None
|
|
|
|
| 633 |
|
| 634 |
# 会話部分が見つからない場合はフォーマット修正を試みる
|
| 635 |
if not conversation_parts:
|
| 636 |
+
logger.warning("No valid conversation parts found. Attempting to fix format...")
|
|
|
|
|
|
|
| 637 |
fixed_text = self._fix_conversation_format(podcast_text)
|
| 638 |
if fixed_text != podcast_text:
|
| 639 |
return self._extract_conversation_parts(fixed_text)
|
|
|
|
| 769 |
# 現在の話者の発言として処理
|
| 770 |
if line_stripped:
|
| 771 |
current_speech.append(line_stripped)
|
| 772 |
+
elif current_speech and not current_speech[-1].endswith("\n"):
|
| 773 |
# 段落区切りの空行
|
| 774 |
+
current_speech[-1] += "\n"
|
|
|
|
| 775 |
elif line_stripped:
|
| 776 |
# 話者が一度も検出されていない場合はデフォルト設定
|
| 777 |
current_speaker = Character.ZUNDAMON.display_name
|
yomitalk/components/content_extractor.py
CHANGED
|
@@ -75,9 +75,7 @@ class ContentExtractor:
|
|
| 75 |
return f"URL conversion error: {str(e)}"
|
| 76 |
|
| 77 |
@classmethod
|
| 78 |
-
def extract_file_content(
|
| 79 |
-
cls, file_obj: Any
|
| 80 |
-
) -> Tuple[Optional[str], Optional[bytes]]:
|
| 81 |
"""
|
| 82 |
メモリ上でファイルコンテンツを抽出します。
|
| 83 |
|
|
@@ -99,9 +97,7 @@ class ContentExtractor:
|
|
| 99 |
original_extension = ".txt" # デフォルト拡張子
|
| 100 |
if hasattr(file_obj, "name"):
|
| 101 |
# 元のファイルの拡張子を取得
|
| 102 |
-
original_extension = os.path.splitext(Path(file_obj.name).name)[
|
| 103 |
-
1
|
| 104 |
-
].lower()
|
| 105 |
# 拡張子がない場合はデフォルト値を使用
|
| 106 |
if not original_extension:
|
| 107 |
original_extension = ".txt"
|
|
@@ -110,10 +106,7 @@ class ContentExtractor:
|
|
| 110 |
file_content = None
|
| 111 |
if hasattr(file_obj, "read") and callable(file_obj.read):
|
| 112 |
# 現在位置を記録
|
| 113 |
-
if hasattr(file_obj, "tell") and callable(file_obj.tell):
|
| 114 |
-
pos = file_obj.tell()
|
| 115 |
-
else:
|
| 116 |
-
pos = 0
|
| 117 |
|
| 118 |
# コンテンツを読み込み
|
| 119 |
file_content = file_obj.read()
|
|
@@ -205,9 +198,7 @@ class ContentExtractor:
|
|
| 205 |
|
| 206 |
# メモリ上のPDFストリームを直接変換
|
| 207 |
logger.debug("Processing PDF from memory stream")
|
| 208 |
-
result = _markdown_converter.convert(
|
| 209 |
-
pdf_stream, stream_info=stream_info
|
| 210 |
-
)
|
| 211 |
|
| 212 |
# 変換結果からテキストコンテンツを取得
|
| 213 |
markdown_content = result.text_content
|
|
@@ -221,9 +212,7 @@ class ContentExtractor:
|
|
| 221 |
return f"Unsupported file type: {file_ext}. Supported types: {', '.join(cls.SUPPORTED_EXTENSIONS)}"
|
| 222 |
|
| 223 |
@classmethod
|
| 224 |
-
def append_text_with_source(
|
| 225 |
-
cls, existing_text: str, new_text: str, source: str, add_separator: bool = True
|
| 226 |
-
) -> str:
|
| 227 |
"""
|
| 228 |
Append new text to existing text with source information.
|
| 229 |
|
|
@@ -245,18 +234,10 @@ class ContentExtractor:
|
|
| 245 |
if add_separator:
|
| 246 |
# Create markdown-style separator with source information
|
| 247 |
separator = f"\n\n---\n**Source: {source}**\n\n"
|
| 248 |
-
if existing_text.strip():
|
| 249 |
-
# If there's existing text, add separator before new content
|
| 250 |
-
result = existing_text.rstrip() + separator + content_to_append
|
| 251 |
-
else:
|
| 252 |
-
# If no existing text, add source info at the beginning
|
| 253 |
-
result = f"**Source: {source}**\n\n" + content_to_append
|
| 254 |
else:
|
| 255 |
# Just append with minimal spacing
|
| 256 |
-
if existing_text.strip():
|
| 257 |
-
result = existing_text.rstrip() + "\n\n" + content_to_append
|
| 258 |
-
else:
|
| 259 |
-
result = content_to_append
|
| 260 |
|
| 261 |
return result
|
| 262 |
|
|
|
|
| 75 |
return f"URL conversion error: {str(e)}"
|
| 76 |
|
| 77 |
@classmethod
|
| 78 |
+
def extract_file_content(cls, file_obj: Any) -> Tuple[Optional[str], Optional[bytes]]:
|
|
|
|
|
|
|
| 79 |
"""
|
| 80 |
メモリ上でファイルコンテンツを抽出します。
|
| 81 |
|
|
|
|
| 97 |
original_extension = ".txt" # デフォルト拡張子
|
| 98 |
if hasattr(file_obj, "name"):
|
| 99 |
# 元のファイルの拡張子を取得
|
| 100 |
+
original_extension = os.path.splitext(Path(file_obj.name).name)[1].lower()
|
|
|
|
|
|
|
| 101 |
# 拡張子がない場合はデフォルト値を使用
|
| 102 |
if not original_extension:
|
| 103 |
original_extension = ".txt"
|
|
|
|
| 106 |
file_content = None
|
| 107 |
if hasattr(file_obj, "read") and callable(file_obj.read):
|
| 108 |
# 現在位置を記録
|
| 109 |
+
pos = file_obj.tell() if hasattr(file_obj, "tell") and callable(file_obj.tell) else 0
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
# コンテンツを読み込み
|
| 112 |
file_content = file_obj.read()
|
|
|
|
| 198 |
|
| 199 |
# メモリ上のPDFストリームを直接変換
|
| 200 |
logger.debug("Processing PDF from memory stream")
|
| 201 |
+
result = _markdown_converter.convert(pdf_stream, stream_info=stream_info)
|
|
|
|
|
|
|
| 202 |
|
| 203 |
# 変換結果からテキストコンテンツを取得
|
| 204 |
markdown_content = result.text_content
|
|
|
|
| 212 |
return f"Unsupported file type: {file_ext}. Supported types: {', '.join(cls.SUPPORTED_EXTENSIONS)}"
|
| 213 |
|
| 214 |
@classmethod
|
| 215 |
+
def append_text_with_source(cls, existing_text: str, new_text: str, source: str, add_separator: bool = True) -> str:
|
|
|
|
|
|
|
| 216 |
"""
|
| 217 |
Append new text to existing text with source information.
|
| 218 |
|
|
|
|
| 234 |
if add_separator:
|
| 235 |
# Create markdown-style separator with source information
|
| 236 |
separator = f"\n\n---\n**Source: {source}**\n\n"
|
| 237 |
+
result = existing_text.rstrip() + separator + content_to_append if existing_text.strip() else f"**Source: {source}**\n\n" + content_to_append
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
else:
|
| 239 |
# Just append with minimal spacing
|
| 240 |
+
result = existing_text.rstrip() + "\n\n" + content_to_append if existing_text.strip() else content_to_append
|
|
|
|
|
|
|
|
|
|
| 241 |
|
| 242 |
return result
|
| 243 |
|