takomattyy committed on
Commit
3f04313
Β·
verified Β·
1 Parent(s): a08bca9

Upload 21 files

Browse files

updated to add analyze_documents

Files changed (4) hide show
  1. README.md +29 -1
  2. analyze_documents.py +283 -0
  3. app.py +49 -0
  4. requirements.txt +1 -0
README.md CHANGED
@@ -35,6 +35,7 @@ Philippine ID and Document OCR Extraction Service using PaddleOCR
35
 
36
  ### Additional Features
37
  - **Document Analysis** - Automatic document type identification
 
38
 
39
  ## πŸš€ Quick Start
40
 
@@ -63,8 +64,9 @@ All extraction endpoints accept POST requests with the following format:
63
  - `POST /api/extract-police-clearance` - Extract Police Clearance
64
  - `POST /api/extract-tesda` - Extract TESDA Certificate
65
 
66
- #### Analysis Endpoint
67
  - `POST /api/analyze-document` - Identify document type
 
68
 
69
  #### Utility Endpoints
70
  - `GET /health` - Health check
@@ -94,6 +96,32 @@ print(result)
94
  # "full_name": "Juan Dela Cruz",
95
  # "birth_date": "1990-01-15"
96
  # }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  ```
98
 
99
  ### cURL Example
 
35
 
36
  ### Additional Features
37
  - **Document Analysis** - Automatic document type identification
38
+ - **Document Tampering Detection** - Analyze multiple documents for tampering using Error Level Analysis (ELA) and metadata inspection
39
 
40
  ## πŸš€ Quick Start
41
 
 
64
  - `POST /api/extract-police-clearance` - Extract Police Clearance
65
  - `POST /api/extract-tesda` - Extract TESDA Certificate
66
 
67
+ #### Analysis Endpoints
68
  - `POST /api/analyze-document` - Identify document type
69
+ - `POST /api/analyze-documents` - Analyze multiple documents for tampering (max 3)
70
 
71
  #### Utility Endpoints
72
  - `GET /health` - Health check
 
96
  # "full_name": "Juan Dela Cruz",
97
  # "birth_date": "1990-01-15"
98
  # }
99
+
100
+ # Analyze multiple documents for tampering
101
+ response = requests.post(
102
+ 'https://YOUR-SPACE.hf.space/api/analyze-documents',
103
+ json={'image_urls': [
104
+ 'https://example.com/id1.jpg',
105
+ 'https://example.com/id2.jpg'
106
+ ]}
107
+ )
108
+
109
+ tampering_result = response.json()
110
+ print(tampering_result)
111
+
112
+ # Expected output:
113
+ # {
114
+ # "success": true,
115
+ # "total_documents": 2,
116
+ # "results": [
117
+ # {
118
+ # "document_id": "doc_1",
119
+ # "tampering_results": {"tampered": "False", "brightness_ratio": 0.015},
120
+ # "metadata_results": {"result": "success", "message": "..."}
121
+ # },
122
+ # ...
123
+ # ]
124
+ # }
125
  ```
126
 
127
  ### cURL Example
analyze_documents.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys, json, requests, exifread, numpy as np
2
+ from PIL import Image, ImageChops, ImageEnhance
3
+ from io import BytesIO
4
+ import os
5
+ import uuid
6
+
7
# Signatures of known image-editing software. analyze_metadata() does a
# substring match against the lowercased EXIF 'Image Software' tag, so the
# single entry 'paint' already covers 'paint.net' and every 'paintshop pro'
# variant that the original exhaustive list spelled out one by one.
software_list = ['photoshop', 'lightroom', 'gimp', 'paint']
8
+
9
def download_image(url, output_path=None):
    """Download an image from *url*, normalize it to RGB JPEG, and save it locally.

    Args:
        url: Image URL. May also be a JSON-encoded string containing either
            a dict with a 'url' key or a list of URLs/URL-dicts.
        output_path: Destination file path. When None, a unique
            'temp_image_<id>.jpg' name is generated so concurrent
            analyses never clobber each other.

    Returns:
        The saved file path on success, or None on any failure
        (bad URL JSON, empty URL, network error, or decode error).
    """
    if output_path is None:
        # Unique filename to avoid conflicts between parallel requests.
        unique_id = uuid.uuid4().hex[:8]
        output_path = f'temp_image_{unique_id}.jpg'

    print(f"DOCUMENT Starting download for URL: {url}", file=sys.stderr)

    # The caller sometimes passes a JSON string instead of a plain URL.
    if url.startswith('{') or url.startswith('['):
        try:
            url_data = json.loads(url)
            print(f"DOCUMENT Parsed JSON URL data: {url_data}", file=sys.stderr)
            if isinstance(url_data, dict) and 'url' in url_data:
                url = url_data['url']
                print(f"DOCUMENT Extracted URL from dict: {url}", file=sys.stderr)
            elif isinstance(url_data, list) and len(url_data) > 0:
                url = url_data[0] if isinstance(url_data[0], str) else url_data[0].get('url', '')
                print(f"DOCUMENT Extracted URL from list: {url}", file=sys.stderr)
        except json.JSONDecodeError as e:
            print(f"DOCUMENT Error parsing URL JSON: {url}, Error: {str(e)}", file=sys.stderr)
            return None

    if not url:
        print(f"DOCUMENT Empty URL after processing", file=sys.stderr)
        return None

    try:
        print(f"DOCUMENT Attempting to download from: {url}", file=sys.stderr)
        response = requests.get(url, timeout=30)  # timeout so a dead host can't hang the worker
        response.raise_for_status()
        image_data = response.content

        print(f"DOCUMENT Downloaded image from {url}", file=sys.stderr)
        print(f"DOCUMENT Image data size: {len(image_data)} bytes", file=sys.stderr)

        image = Image.open(BytesIO(image_data))
        # Remember the mode before any conversion so the log below is accurate.
        original_mode = image.mode
        print(f"DOCUMENT Original image format: {image.format}", file=sys.stderr)
        print(f"DOCUMENT Original image mode: {original_mode}", file=sys.stderr)

        # JPEG has no alpha channel: flatten RGBA onto a white background,
        # convert any other non-RGB mode (palette, grayscale, ...) directly.
        if image.mode == 'RGBA':
            background = Image.new('RGB', image.size, (255, 255, 255))
            background.paste(image, mask=image.split()[-1])  # alpha channel as mask
            image = background
            print(f"DOCUMENT Converted RGBA to RGB", file=sys.stderr)
        elif image.mode != 'RGB':
            image = image.convert('RGB')
            # BUG FIX: log the *original* mode — after convert(), image.mode
            # is already 'RGB', so the old message always read "RGB to RGB".
            print(f"DOCUMENT Converted {original_mode} to RGB", file=sys.stderr)

        # Save as JPG (EXIF is intentionally not preserved here).
        image.save(output_path, 'JPEG', quality=95)
        print(f"DOCUMENT Saved image to {output_path}", file=sys.stderr)

        return output_path
    except requests.exceptions.RequestException as e:
        print(f"DOCUMENT Request error downloading image: {str(e)}", file=sys.stderr)
        return None
    except Exception as e:
        print(f"DOCUMENT Error downloading image: {str(e)}", file=sys.stderr)
        return None
73
+
74
def perform_error_level_analysis(image_path, output_path=None, quality=90):
    """Run Error Level Analysis (ELA) on an image and save the result.

    ELA re-saves the image as JPEG at a known quality and amplifies the
    pixel-wise difference: regions that were recompressed at a different
    error level (a common sign of splicing/editing) show up brighter.

    Args:
        image_path: Path to the image to analyze.
        output_path: Where to write the amplified difference image.
            Defaults to '<image_path stem>_ELA.png'.
        quality: JPEG quality used for the re-save step.

    Returns:
        Path of the saved ELA image (output_path).
    """
    if output_path is None:
        # Derive the ELA output name from the source image.
        base_name = os.path.splitext(image_path)[0]
        output_path = f"{base_name}_ELA.png"

    print(f"DOCUMENT TAMPERING Starting Error Level Analysis...", file=sys.stderr)

    original_image = Image.open(image_path)

    # JPEG has no alpha channel: flatten RGBA onto white, convert other modes.
    if original_image.mode == 'RGBA':
        background = Image.new('RGB', original_image.size, (255, 255, 255))
        background.paste(original_image, mask=original_image.split()[-1])  # alpha as mask
        original_image = background
    elif original_image.mode != 'RGB':
        original_image = original_image.convert('RGB')

    # Unique temp file per call so concurrent analyses don't collide.
    temp_path = f"temp_ela_{uuid.uuid4().hex[:8]}.jpg"
    try:
        original_image.save(temp_path, "JPEG", quality=quality)
        resaved_image = Image.open(temp_path)

        difference = ImageChops.difference(original_image, resaved_image)
        difference = ImageEnhance.Brightness(difference).enhance(10)
        difference.save(output_path)
    finally:
        # Best-effort cleanup even if the ELA step raised. Only swallow
        # filesystem errors — a bare `except:` here used to hide real bugs.
        try:
            os.remove(temp_path)
        except OSError:
            pass

    print(f"DOCUMENT TAMPERING ELA analysis completed, saved to {output_path}", file=sys.stderr)
    return output_path
111
+
112
def detect_tampering(ela_path, threshold=100):
    """Classify an ELA image as tampered / clean / needs review.

    Computes the fraction of ELA pixel values brighter than *threshold*;
    a high ratio means large regions recompressed differently from the
    rest of the image.

    Args:
        ela_path: Path to the ELA image produced by perform_error_level_analysis().
        threshold: Per-channel brightness cutoff (0-255) for a "bright" pixel.

    Returns:
        Dict with 'tampered' ("False" / "Investigation Required" / "True")
        and the rounded 'brightness_ratio'.
    """
    print(f"DOCUMENT TAMPERING Analyzing ELA results...", file=sys.stderr)

    ela_image = Image.open(ela_path)
    ela_array = np.array(ela_image)

    bright_pixels = np.sum(ela_array > threshold)
    total_pixels = ela_array.size

    brightness_ratio = bright_pixels / total_pixels

    print(f"DOCUMENT TAMPERING Brightness ratio: {brightness_ratio:.4f}", file=sys.stderr)

    # BUG FIX: the original branches were `< 0.02`, `> 0.02 and <= 0.05`,
    # and `> 0.05`, so a ratio of exactly 0.02 fell through every branch
    # and the function returned None. These bounds cover the full range.
    if brightness_ratio <= 0.02:
        verdict = "False"
    elif brightness_ratio <= 0.05:
        verdict = "Investigation Required"
    else:
        verdict = "True"

    return {
        "tampered": verdict,
        "brightness_ratio": round(float(brightness_ratio), 4)
    }
140
+
141
def analyze_metadata(image_path):
    """Inspect an image's EXIF metadata for signs of editing.

    Reads EXIF tags via exifread and looks for three signals: a capture
    timestamp, camera make/model information, and known editing-software
    signatures in the 'Image Software' tag.

    Args:
        image_path: Path to the image file on disk.

    Returns:
        Dict with 'result' ('error' / 'no metadata' / 'minimal metadata' /
        'edited' / 'success'), a human-readable 'message', the raw
        'metadata' mapping, and (except for errors and missing metadata)
        an 'analysis' summary string.
    """
    print(f"DOCUMENT METADATA Starting metadata analysis...", file=sys.stderr)

    try:
        with open(image_path, 'rb') as fh:
            tags = exifread.process_file(fh)
            metadata = {key: str(tags.get(key)) for key in tags.keys()}

        # Debug: report what was extracted.
        print(f"DOCUMENT METADATA Found {len(metadata)} metadata tags", file=sys.stderr)
        if metadata:
            print(f"DOCUMENT METADATA Metadata keys: {list(metadata.keys())}", file=sys.stderr)

    except Exception as exc:
        print(f"DOCUMENT METADATA Error reading metadata: {str(exc)}", file=sys.stderr)
        return {
            "result": "error",
            "message": f"Error reading metadata: {str(exc)}",
            "metadata": {}
        }

    if not metadata:
        print(f"DOCUMENT METADATA No metadata found in {image_path}", file=sys.stderr)
        return {
            "result": "no metadata",
            "message": "No metadata found in image.",
            "metadata": metadata
        }

    # Three independent signals of a genuine camera capture vs. an export.
    timestamp_keys = ('EXIF DateTimeOriginal', 'Image DateTime', 'EXIF DateTime')
    camera_keys = ('Image Make', 'Image Model', 'EXIF Make', 'EXIF Model')

    has_timestamp = any(key in metadata for key in timestamp_keys)
    has_camera_info = any(key in metadata for key in camera_keys)
    software_used = metadata.get('Image Software', '').lower()

    # Collect human-readable findings for the success message.
    findings = []
    if has_timestamp:
        findings.append("Timestamp found")
    if has_camera_info:
        findings.append("Camera information found")
    if software_used:
        findings.append(f"Software detected: {software_used}")

    if not (has_timestamp or has_camera_info or software_used):
        return {
            "result": "minimal metadata",
            "message": "Image contains minimal metadata (no timestamp, camera info, or editing software detected)",
            "metadata": metadata,
            "analysis": "Image appears to be legitimate with no signs of editing"
        }

    # Substring match against the module-level editing-software signatures.
    if any(term in software_used for term in software_list):
        return {
            "result": "edited",
            "message": f"Image appears to have been edited with software: {software_used}",
            "metadata": metadata,
            "analysis": "Suspicious editing software detected"
        }

    return {
        "result": "success",
        "message": f"Successfully analyzed metadata: {', '.join(findings)}",
        "metadata": metadata,
        "analysis": "Image appears to be legitimate with no signs of editing"
    }
210
+
211
def analyze_single_document(image_url, document_id=None):
    """Download one document image and run tampering + metadata analysis.

    Args:
        image_url: URL of the document image.
        document_id: Optional identifier echoed back in the result;
            defaults to the generated unique id.

    Returns:
        Dict with 'document_id', 'image_url', 'success', and either
        'tampering_results'/'metadata_results' or an 'error' message.
    """
    try:
        # Unique per-document filenames so parallel analyses don't collide.
        unique_id = uuid.uuid4().hex[:8]
        image_path = download_image(image_url, f'temp_image_{unique_id}.jpg')

        if image_path is None:
            return {
                "document_id": document_id or unique_id,
                "image_url": image_url,
                "success": False,
                "error": "Failed to download image"
            }

        ela_path = f'ELA_{unique_id}.png'
        try:
            # Error Level Analysis + tampering classification.
            perform_error_level_analysis(image_path, ela_path)
            tampering_results = detect_tampering(ela_path)

            # EXIF metadata inspection.
            metadata_results = analyze_metadata(image_path)
        finally:
            # BUG FIX: remove each temp file independently — the original
            # deleted both inside a single try block, so a failure removing
            # the first leaked the second; it also swallowed every exception
            # with a bare `except:` instead of just filesystem errors.
            for path in (image_path, ela_path):
                try:
                    os.remove(path)
                except OSError:
                    pass

        return {
            "document_id": document_id or unique_id,
            "image_url": image_url,
            "success": True,
            "tampering_results": tampering_results,
            "metadata_results": metadata_results
        }

    except Exception as e:
        return {
            "document_id": document_id or "unknown",
            "image_url": image_url,
            "success": False,
            "error": str(e)
        }
255
+
256
# ---------------------------------------------------------------------------
# CLI entry point: analyze up to three document image URLs (passed as argv)
# and emit a combined JSON report on stdout; progress goes to stderr.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    urls = sys.argv[1:]

    if not urls:
        print(json.dumps({"error": "No image URLs provided"}))
        sys.exit(1)

    # Hard cap at three documents per invocation.
    if len(urls) > 3:
        print(json.dumps({"error": "Maximum 3 documents allowed"}))
        sys.exit(1)

    all_results = []
    for index, document_url in enumerate(urls, start=1):
        print(f"Processing document {index}/{len(urls)}: {document_url}", file=sys.stderr)
        all_results.append(analyze_single_document(document_url, f"doc_{index}"))

    # Single JSON object on stdout is the machine-readable contract.
    print(json.dumps({
        "success": True,
        "total_documents": len(all_results),
        "results": all_results
    }))
app.py CHANGED
@@ -282,6 +282,50 @@ def api_analyze_document():
282
  except Exception as e:
283
  return jsonify({'success': False, 'error': str(e)}), 500
284
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  # ============================================================================
286
  # UTILITY ENDPOINTS
287
  # ============================================================================
@@ -355,6 +399,11 @@ def index():
355
  'POST /api/analyze-document': {
356
  'description': 'Analyze and identify document type from image',
357
  'fields': ['document_type', 'confidence']
 
 
 
 
 
358
  }
359
  },
360
  'Utility': {
 
282
  except Exception as e:
283
  return jsonify({'success': False, 'error': str(e)}), 500
284
 
285
@app.route('/api/analyze-documents', methods=['POST'])
def api_analyze_documents():
    """Analyze up to three documents for tampering and metadata anomalies.

    Expects a JSON body: {"image_urls": ["https://...", ...]} (max 3).
    Delegates the analysis to analyze_documents.py in a subprocess and
    relays its JSON report.

    Returns:
        Flask JSON response: 400 for bad input, 500 for unexpected
        failures, otherwise the script's output
        ({"success", "total_documents", "results"}).
    """
    try:
        # BUG FIX: request.json is None (or raises) when the body is missing
        # or not JSON, which crashed on .get() and surfaced as a 500.
        # silent=True turns that into a clean 400 below.
        data = request.get_json(silent=True) or {}
        image_urls = data.get('image_urls', [])

        # Validate the type too: a non-list (e.g. a string) would blow up
        # the argv concatenation further down.
        if not image_urls or not isinstance(image_urls, list):
            return jsonify({'error': 'Missing image_urls array'}), 400

        if len(image_urls) > 3:
            return jsonify({'error': 'Maximum 3 documents allowed'}), 400

        # Run the analyzer out-of-process. Passing an argv list (no shell)
        # keeps the user-supplied URLs from being shell-interpreted.
        cmd = [sys.executable, 'analyze_documents.py'] + image_urls
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=300,
            cwd=os.getcwd()
        )

        if result.returncode != 0:
            return jsonify({
                'success': False,
                'error': f'Script failed with return code {result.returncode}',
                'stderr': result.stderr
            })

        # The script prints a single JSON object on stdout.
        try:
            output_str = result.stdout.strip()
            return jsonify(json.loads(output_str))
        except Exception:
            return jsonify({
                'success': False,
                'error': 'Invalid JSON output from script',
                'raw_output': result.stdout[:500]
            })

    except Exception as e:
        return jsonify({'success': False, 'error': str(e)}), 500
328
+
329
  # ============================================================================
330
  # UTILITY ENDPOINTS
331
  # ============================================================================
 
399
  'POST /api/analyze-document': {
400
  'description': 'Analyze and identify document type from image',
401
  'fields': ['document_type', 'confidence']
402
+ },
403
+ 'POST /api/analyze-documents': {
404
+ 'description': 'Analyze multiple documents for tampering detection and metadata (max 3)',
405
+ 'fields': ['tampering_results', 'metadata_results'],
406
+ 'body': {'image_urls': 'array of image URLs (max 3)'}
407
  }
408
  },
409
  'Utility': {
requirements.txt CHANGED
@@ -11,4 +11,5 @@ pyclipper>=1.3.0
11
  imgaug>=0.4.0
12
  lmdb>=1.4.0
13
  tqdm>=4.65.0
 
14
 
 
11
  imgaug>=0.4.0
12
  lmdb>=1.4.0
13
  tqdm>=4.65.0
14
+ exifread>=3.0.0
15