takomattyy committed on
Commit
3f04313
Β·
verified Β·
1 Parent(s): a08bca9

Upload 21 files

Browse files

updated to add analyze_documents

Files changed (4) hide show
  1. README.md +29 -1
  2. analyze_documents.py +283 -0
  3. app.py +49 -0
  4. requirements.txt +1 -0
README.md CHANGED
@@ -35,6 +35,7 @@ Philippine ID and Document OCR Extraction Service using PaddleOCR
35
 
36
  ### Additional Features
37
  - **Document Analysis** - Automatic document type identification
 
38
 
39
  ## πŸš€ Quick Start
40
 
@@ -63,8 +64,9 @@ All extraction endpoints accept POST requests with the following format:
63
  - `POST /api/extract-police-clearance` - Extract Police Clearance
64
  - `POST /api/extract-tesda` - Extract TESDA Certificate
65
 
66
- #### Analysis Endpoint
67
  - `POST /api/analyze-document` - Identify document type
 
68
 
69
  #### Utility Endpoints
70
  - `GET /health` - Health check
@@ -94,6 +96,32 @@ print(result)
94
  # "full_name": "Juan Dela Cruz",
95
  # "birth_date": "1990-01-15"
96
  # }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  ```
98
 
99
  ### cURL Example
 
35
 
36
  ### Additional Features
37
  - **Document Analysis** - Automatic document type identification
38
+ - **Document Tampering Detection** - Analyze multiple documents for tampering using Error Level Analysis (ELA) and metadata inspection
39
 
40
  ## πŸš€ Quick Start
41
 
 
64
  - `POST /api/extract-police-clearance` - Extract Police Clearance
65
  - `POST /api/extract-tesda` - Extract TESDA Certificate
66
 
67
+ #### Analysis Endpoints
68
  - `POST /api/analyze-document` - Identify document type
69
+ - `POST /api/analyze-documents` - Analyze multiple documents for tampering (max 3)
70
 
71
  #### Utility Endpoints
72
  - `GET /health` - Health check
 
96
  # "full_name": "Juan Dela Cruz",
97
  # "birth_date": "1990-01-15"
98
  # }
99
+
100
+ # Analyze multiple documents for tampering
101
+ response = requests.post(
102
+ 'https://YOUR-SPACE.hf.space/api/analyze-documents',
103
+ json={'image_urls': [
104
+ 'https://example.com/id1.jpg',
105
+ 'https://example.com/id2.jpg'
106
+ ]}
107
+ )
108
+
109
+ tampering_result = response.json()
110
+ print(tampering_result)
111
+
112
+ # Expected output:
113
+ # {
114
+ # "success": true,
115
+ # "total_documents": 2,
116
+ # "results": [
117
+ # {
118
+ # "document_id": "doc_1",
119
+ # "tampering_results": {"tampered": "False", "brightness_ratio": 0.015},
120
+ # "metadata_results": {"result": "success", "message": "..."}
121
+ # },
122
+ # ...
123
+ # ]
124
+ # }
125
  ```
126
 
127
  ### cURL Example
analyze_documents.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys, json, requests, exifread, numpy as np
2
+ from PIL import Image, ImageChops, ImageEnhance
3
+ from io import BytesIO
4
+ import os
5
+ import uuid
6
+
7
# Signatures of known image-editing software. analyze_metadata() does a
# substring match against the lowercased EXIF 'Image Software' tag, so the
# single entry 'paint' already covers 'paint.net' and every 'paintshop pro'
# variant that the original exhaustive list spelled out one by one.
software_list = ['photoshop', 'lightroom', 'gimp', 'paint']
8
+
9
def download_image(url, output_path=None):
    """Download an image from *url*, normalize it to RGB JPEG, and save it locally.

    Args:
        url: Image URL. May also be a JSON-encoded string containing either
            a dict with a 'url' key or a list of URLs/URL-dicts.
        output_path: Destination file path. When None, a unique
            'temp_image_<id>.jpg' name is generated so concurrent
            analyses never clobber each other.

    Returns:
        The saved file path on success, or None on any failure
        (bad URL JSON, empty URL, network error, or decode error).
    """
    if output_path is None:
        # Unique filename to avoid conflicts between parallel requests.
        unique_id = uuid.uuid4().hex[:8]
        output_path = f'temp_image_{unique_id}.jpg'

    print(f"DOCUMENT Starting download for URL: {url}", file=sys.stderr)

    # The caller sometimes passes a JSON string instead of a plain URL.
    if url.startswith('{') or url.startswith('['):
        try:
            url_data = json.loads(url)
            print(f"DOCUMENT Parsed JSON URL data: {url_data}", file=sys.stderr)
            if isinstance(url_data, dict) and 'url' in url_data:
                url = url_data['url']
                print(f"DOCUMENT Extracted URL from dict: {url}", file=sys.stderr)
            elif isinstance(url_data, list) and len(url_data) > 0:
                url = url_data[0] if isinstance(url_data[0], str) else url_data[0].get('url', '')
                print(f"DOCUMENT Extracted URL from list: {url}", file=sys.stderr)
        except json.JSONDecodeError as e:
            print(f"DOCUMENT Error parsing URL JSON: {url}, Error: {str(e)}", file=sys.stderr)
            return None

    if not url:
        print(f"DOCUMENT Empty URL after processing", file=sys.stderr)
        return None

    try:
        print(f"DOCUMENT Attempting to download from: {url}", file=sys.stderr)
        response = requests.get(url, timeout=30)  # timeout so a dead host can't hang the worker
        response.raise_for_status()
        image_data = response.content

        print(f"DOCUMENT Downloaded image from {url}", file=sys.stderr)
        print(f"DOCUMENT Image data size: {len(image_data)} bytes", file=sys.stderr)

        image = Image.open(BytesIO(image_data))
        # Remember the mode before any conversion so the log below is accurate.
        original_mode = image.mode
        print(f"DOCUMENT Original image format: {image.format}", file=sys.stderr)
        print(f"DOCUMENT Original image mode: {original_mode}", file=sys.stderr)

        # JPEG has no alpha channel: flatten RGBA onto a white background,
        # convert any other non-RGB mode (palette, grayscale, ...) directly.
        if image.mode == 'RGBA':
            background = Image.new('RGB', image.size, (255, 255, 255))
            background.paste(image, mask=image.split()[-1])  # alpha channel as mask
            image = background
            print(f"DOCUMENT Converted RGBA to RGB", file=sys.stderr)
        elif image.mode != 'RGB':
            image = image.convert('RGB')
            # BUG FIX: log the *original* mode — after convert(), image.mode
            # is already 'RGB', so the old message always read "RGB to RGB".
            print(f"DOCUMENT Converted {original_mode} to RGB", file=sys.stderr)

        # Save as JPG (EXIF is intentionally not preserved here).
        image.save(output_path, 'JPEG', quality=95)
        print(f"DOCUMENT Saved image to {output_path}", file=sys.stderr)

        return output_path
    except requests.exceptions.RequestException as e:
        print(f"DOCUMENT Request error downloading image: {str(e)}", file=sys.stderr)
        return None
    except Exception as e:
        print(f"DOCUMENT Error downloading image: {str(e)}", file=sys.stderr)
        return None
73
+
74
def perform_error_level_analysis(image_path, output_path=None, quality=90):
    """Run Error Level Analysis (ELA) on an image and save the result.

    ELA re-saves the image as JPEG at a known quality and amplifies the
    pixel-wise difference: regions that were recompressed at a different
    error level (a common sign of splicing/editing) show up brighter.

    Args:
        image_path: Path to the image to analyze.
        output_path: Where to write the amplified difference image.
            Defaults to '<image_path stem>_ELA.png'.
        quality: JPEG quality used for the re-save step.

    Returns:
        Path of the saved ELA image (output_path).
    """
    if output_path is None:
        # Derive the ELA output name from the source image.
        base_name = os.path.splitext(image_path)[0]
        output_path = f"{base_name}_ELA.png"

    print(f"DOCUMENT TAMPERING Starting Error Level Analysis...", file=sys.stderr)

    original_image = Image.open(image_path)

    # JPEG has no alpha channel: flatten RGBA onto white, convert other modes.
    if original_image.mode == 'RGBA':
        background = Image.new('RGB', original_image.size, (255, 255, 255))
        background.paste(original_image, mask=original_image.split()[-1])  # alpha as mask
        original_image = background
    elif original_image.mode != 'RGB':
        original_image = original_image.convert('RGB')

    # Unique temp file per call so concurrent analyses don't collide.
    temp_path = f"temp_ela_{uuid.uuid4().hex[:8]}.jpg"
    try:
        original_image.save(temp_path, "JPEG", quality=quality)
        resaved_image = Image.open(temp_path)

        difference = ImageChops.difference(original_image, resaved_image)
        difference = ImageEnhance.Brightness(difference).enhance(10)
        difference.save(output_path)
    finally:
        # Best-effort cleanup even if the ELA step raised. Only swallow
        # filesystem errors — a bare `except:` here used to hide real bugs.
        try:
            os.remove(temp_path)
        except OSError:
            pass

    print(f"DOCUMENT TAMPERING ELA analysis completed, saved to {output_path}", file=sys.stderr)
    return output_path
111
+
112
def detect_tampering(ela_path, threshold=100):
    """Classify an ELA image as tampered / clean / needs review.

    Computes the fraction of ELA pixel values brighter than *threshold*;
    a high ratio means large regions recompressed differently from the
    rest of the image.

    Args:
        ela_path: Path to the ELA image produced by perform_error_level_analysis().
        threshold: Per-channel brightness cutoff (0-255) for a "bright" pixel.

    Returns:
        Dict with 'tampered' ("False" / "Investigation Required" / "True")
        and the rounded 'brightness_ratio'.
    """
    print(f"DOCUMENT TAMPERING Analyzing ELA results...", file=sys.stderr)

    ela_image = Image.open(ela_path)
    ela_array = np.array(ela_image)

    bright_pixels = np.sum(ela_array > threshold)
    total_pixels = ela_array.size

    brightness_ratio = bright_pixels / total_pixels

    print(f"DOCUMENT TAMPERING Brightness ratio: {brightness_ratio:.4f}", file=sys.stderr)

    # BUG FIX: the original branches were `< 0.02`, `> 0.02 and <= 0.05`,
    # and `> 0.05`, so a ratio of exactly 0.02 fell through every branch
    # and the function returned None. These bounds cover the full range.
    if brightness_ratio <= 0.02:
        verdict = "False"
    elif brightness_ratio <= 0.05:
        verdict = "Investigation Required"
    else:
        verdict = "True"

    return {
        "tampered": verdict,
        "brightness_ratio": round(float(brightness_ratio), 4)
    }
140
+
141
def analyze_metadata(image_path):
    """Inspect an image's EXIF metadata for signs of editing.

    Reads EXIF tags via exifread and looks for three signals: a capture
    timestamp, camera make/model information, and known editing-software
    signatures in the 'Image Software' tag.

    Args:
        image_path: Path to the image file on disk.

    Returns:
        Dict with 'result' ('error' / 'no metadata' / 'minimal metadata' /
        'edited' / 'success'), a human-readable 'message', the raw
        'metadata' mapping, and (except for errors and missing metadata)
        an 'analysis' summary string.
    """
    print(f"DOCUMENT METADATA Starting metadata analysis...", file=sys.stderr)

    try:
        with open(image_path, 'rb') as fh:
            tags = exifread.process_file(fh)
            metadata = {key: str(tags.get(key)) for key in tags.keys()}

        # Debug: report what was extracted.
        print(f"DOCUMENT METADATA Found {len(metadata)} metadata tags", file=sys.stderr)
        if metadata:
            print(f"DOCUMENT METADATA Metadata keys: {list(metadata.keys())}", file=sys.stderr)

    except Exception as exc:
        print(f"DOCUMENT METADATA Error reading metadata: {str(exc)}", file=sys.stderr)
        return {
            "result": "error",
            "message": f"Error reading metadata: {str(exc)}",
            "metadata": {}
        }

    if not metadata:
        print(f"DOCUMENT METADATA No metadata found in {image_path}", file=sys.stderr)
        return {
            "result": "no metadata",
            "message": "No metadata found in image.",
            "metadata": metadata
        }

    # Three independent signals of a genuine camera capture vs. an export.
    timestamp_keys = ('EXIF DateTimeOriginal', 'Image DateTime', 'EXIF DateTime')
    camera_keys = ('Image Make', 'Image Model', 'EXIF Make', 'EXIF Model')

    has_timestamp = any(key in metadata for key in timestamp_keys)
    has_camera_info = any(key in metadata for key in camera_keys)
    software_used = metadata.get('Image Software', '').lower()

    # Collect human-readable findings for the success message.
    findings = []
    if has_timestamp:
        findings.append("Timestamp found")
    if has_camera_info:
        findings.append("Camera information found")
    if software_used:
        findings.append(f"Software detected: {software_used}")

    if not (has_timestamp or has_camera_info or software_used):
        return {
            "result": "minimal metadata",
            "message": "Image contains minimal metadata (no timestamp, camera info, or editing software detected)",
            "metadata": metadata,
            "analysis": "Image appears to be legitimate with no signs of editing"
        }

    # Substring match against the module-level editing-software signatures.
    if any(term in software_used for term in software_list):
        return {
            "result": "edited",
            "message": f"Image appears to have been edited with software: {software_used}",
            "metadata": metadata,
            "analysis": "Suspicious editing software detected"
        }

    return {
        "result": "success",
        "message": f"Successfully analyzed metadata: {', '.join(findings)}",
        "metadata": metadata,
        "analysis": "Image appears to be legitimate with no signs of editing"
    }
210
+
211
def analyze_single_document(image_url, document_id=None):
    """Download one document image and run tampering + metadata analysis.

    Args:
        image_url: URL of the document image.
        document_id: Optional identifier echoed back in the result;
            defaults to the generated unique id.

    Returns:
        Dict with 'document_id', 'image_url', 'success', and either
        'tampering_results'/'metadata_results' or an 'error' message.
    """
    try:
        # Unique per-document filenames so parallel analyses don't collide.
        unique_id = uuid.uuid4().hex[:8]
        image_path = download_image(image_url, f'temp_image_{unique_id}.jpg')

        if image_path is None:
            return {
                "document_id": document_id or unique_id,
                "image_url": image_url,
                "success": False,
                "error": "Failed to download image"
            }

        ela_path = f'ELA_{unique_id}.png'
        try:
            # Error Level Analysis + tampering classification.
            perform_error_level_analysis(image_path, ela_path)
            tampering_results = detect_tampering(ela_path)

            # EXIF metadata inspection.
            metadata_results = analyze_metadata(image_path)
        finally:
            # BUG FIX: remove each temp file independently — the original
            # deleted both inside a single try block, so a failure removing
            # the first leaked the second; it also swallowed every exception
            # with a bare `except:` instead of just filesystem errors.
            for path in (image_path, ela_path):
                try:
                    os.remove(path)
                except OSError:
                    pass

        return {
            "document_id": document_id or unique_id,
            "image_url": image_url,
            "success": True,
            "tampering_results": tampering_results,
            "metadata_results": metadata_results
        }

    except Exception as e:
        return {
            "document_id": document_id or "unknown",
            "image_url": image_url,
            "success": False,
            "error": str(e)
        }
255
+
256
# ---------------------------------------------------------------------------
# CLI entry point: analyze up to three document image URLs (passed as argv)
# and emit a combined JSON report on stdout; progress goes to stderr.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    urls = sys.argv[1:]

    if not urls:
        print(json.dumps({"error": "No image URLs provided"}))
        sys.exit(1)

    # Hard cap at three documents per invocation.
    if len(urls) > 3:
        print(json.dumps({"error": "Maximum 3 documents allowed"}))
        sys.exit(1)

    all_results = []
    for index, document_url in enumerate(urls, start=1):
        print(f"Processing document {index}/{len(urls)}: {document_url}", file=sys.stderr)
        all_results.append(analyze_single_document(document_url, f"doc_{index}"))

    # Single JSON object on stdout is the machine-readable contract.
    print(json.dumps({
        "success": True,
        "total_documents": len(all_results),
        "results": all_results
    }))
app.py CHANGED
@@ -282,6 +282,50 @@ def api_analyze_document():
282
  except Exception as e:
283
  return jsonify({'success': False, 'error': str(e)}), 500
284
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  # ============================================================================
286
  # UTILITY ENDPOINTS
287
  # ============================================================================
@@ -355,6 +399,11 @@ def index():
355
  'POST /api/analyze-document': {
356
  'description': 'Analyze and identify document type from image',
357
  'fields': ['document_type', 'confidence']
 
 
 
 
 
358
  }
359
  },
360
  'Utility': {
 
282
  except Exception as e:
283
  return jsonify({'success': False, 'error': str(e)}), 500
284
 
285
@app.route('/api/analyze-documents', methods=['POST'])
def api_analyze_documents():
    """Analyze up to three documents for tampering and metadata anomalies.

    Expects a JSON body: {"image_urls": ["https://...", ...]} (max 3).
    Delegates the analysis to analyze_documents.py in a subprocess and
    relays its JSON report.

    Returns:
        Flask JSON response: 400 for bad input, 500 for unexpected
        failures, otherwise the script's output
        ({"success", "total_documents", "results"}).
    """
    try:
        # BUG FIX: request.json is None (or raises) when the body is missing
        # or not JSON, which crashed on .get() and surfaced as a 500.
        # silent=True turns that into a clean 400 below.
        data = request.get_json(silent=True) or {}
        image_urls = data.get('image_urls', [])

        # Validate the type too: a non-list (e.g. a string) would blow up
        # the argv concatenation further down.
        if not image_urls or not isinstance(image_urls, list):
            return jsonify({'error': 'Missing image_urls array'}), 400

        if len(image_urls) > 3:
            return jsonify({'error': 'Maximum 3 documents allowed'}), 400

        # Run the analyzer out-of-process. Passing an argv list (no shell)
        # keeps the user-supplied URLs from being shell-interpreted.
        cmd = [sys.executable, 'analyze_documents.py'] + image_urls
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=300,
            cwd=os.getcwd()
        )

        if result.returncode != 0:
            return jsonify({
                'success': False,
                'error': f'Script failed with return code {result.returncode}',
                'stderr': result.stderr
            })

        # The script prints a single JSON object on stdout.
        try:
            output_str = result.stdout.strip()
            return jsonify(json.loads(output_str))
        except Exception:
            return jsonify({
                'success': False,
                'error': 'Invalid JSON output from script',
                'raw_output': result.stdout[:500]
            })

    except Exception as e:
        return jsonify({'success': False, 'error': str(e)}), 500
328
+
329
  # ============================================================================
330
  # UTILITY ENDPOINTS
331
  # ============================================================================
 
399
  'POST /api/analyze-document': {
400
  'description': 'Analyze and identify document type from image',
401
  'fields': ['document_type', 'confidence']
402
+ },
403
+ 'POST /api/analyze-documents': {
404
+ 'description': 'Analyze multiple documents for tampering detection and metadata (max 3)',
405
+ 'fields': ['tampering_results', 'metadata_results'],
406
+ 'body': {'image_urls': 'array of image URLs (max 3)'}
407
  }
408
  },
409
  'Utility': {
requirements.txt CHANGED
@@ -11,4 +11,5 @@ pyclipper>=1.3.0
11
  imgaug>=0.4.0
12
  lmdb>=1.4.0
13
  tqdm>=4.65.0
 
14
 
 
11
  imgaug>=0.4.0
12
  lmdb>=1.4.0
13
  tqdm>=4.65.0
14
+ exifread>=3.0.0
15