-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtemplate.py
More file actions
118 lines (99 loc) · 3.56 KB
/
template.py
File metadata and controls
118 lines (99 loc) · 3.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import os
from pathlib import Path
import logging
# Emit INFO-level progress messages while the scaffold is generated.
logging.basicConfig(level=logging.INFO)

project_name = "DocumentClassification_extraction"

# Manifest of every path the scaffold should contain, relative to the
# working directory. The first group lives under src/<project_name>/ and is
# expanded from the relative names below; the rest are spelled out in full.
list_of_files = [
    f"src/{project_name}/{relative_path}"
    for relative_path in (
        "__init__.py",
        "components/__init__.py",
        "components/data_ingestion.py",
        "components/data_transformation.py",
        "components/model_trainer.py",
        "components/model_monitoring.py",
        "pipelines/__init__.py",
        "pipelines/training_pipeline.py",
        "pipelines/prediction_pipeline.py",
        "exception.py",
        "logger.py",
        "utils.py",
    )
] + [
    # Model, OCR, API and shared utility modules
    "model/__init__.py",
    "model/classifier_model.h5",
    "model/field_extraction.py",
    "ocr/__init__.py",
    "ocr/ocr_engine.py",
    "api/__init__.py",
    "api/main.py",
    "api/utils.py",
    "utils/__init__.py",
    "utils/preprocessing.py",
    "utils/image_utils.py",
    "utils/create_labels_csv.py",
    # One placeholder file per data directory
    "data/invoices/.gitkeep",
    "data/id_cards/.gitkeep",
    "data/certificates/.gitkeep",
    "data/resumes/.gitkeep",
    # Logs directory
    "logs/__init__.py",
    "logs/predictions.db",
    # Top-level entry points, packaging and configuration files
    "main.py",
    "app.py",
    "requirements.txt",
    "setup.py",
    "Dockerfile",
    "README.md",
]
# Starter source for ocr/ocr_engine.py. Backslashes and quotes are kept
# exactly as they must appear in the generated file.
_OCR_ENGINE_TEMPLATE = """import easyocr
import pytesseract
from PIL import Image
import io


class OCREngine:
    def __init__(self):
        self.easyocr_reader = easyocr.Reader(['en'])

    def extract_text_easyocr(self, image_path):
        result = self.easyocr_reader.readtext(image_path, detail=0)
        return " ".join(result)

    def extract_text_tesseract(self, image_path):
        image = Image.open(image_path)
        text = pytesseract.image_to_string(image)
        return text
"""

# Starter source for model/field_extraction.py. This literal is not a raw
# string, so each "\\" below is written to disk as a single backslash and the
# generated file ends up with ordinary raw-string regexes.
_FIELD_EXTRACTION_TEMPLATE = """import re


class FieldExtractor:
    def extract_invoice_fields(self, text):
        name = re.search(r"Name[:\\-]?\\s*([A-Za-z ]+)", text)
        date = re.search(r"Date[:\\-]?\\s*([\\d\\-\\/]+)", text)
        amount = re.search(r"Amount[:\\-]?\\s*([\\d,\\.]+)", text)
        return {
            "name": name.group(1) if name else "",
            "date": date.group(1) if date else "",
            "amount": amount.group(1) if amount else ""
        }

    def extract_id_fields(self, text):
        name = re.search(r"Name[:\\-]?\\s*([A-Za-z ]+)", text)
        dob = re.search(r"DOB[:\\-]?\\s*([\\d\\-\\/]+)", text)
        id_number = re.search(r"ID[:\\-]?\\s*([A-Z0-9]+)", text)
        return {
            "name": name.group(1) if name else "",
            "dob": dob.group(1) if dob else "",
            "id_number": id_number.group(1) if id_number else ""
        }
"""

# File name -> starter content. Any file whose name is not listed here is
# created empty.
_STARTER_CODE = {
    "ocr_engine.py": _OCR_ENGINE_TEMPLATE,
    "field_extraction.py": _FIELD_EXTRACTION_TEMPLATE,
}

# Create every entry of list_of_files: make its parent directory, then write
# the file if it is missing or empty. Files that already have content are
# never overwritten, so the script is safe to re-run.
for filepath in list_of_files:
    filepath = Path(filepath)
    filedir, filename = filepath.parent, filepath.name

    # Top-level files have "." as their parent; there is nothing to create.
    if filedir != Path("."):
        filedir.mkdir(parents=True, exist_ok=True)
        logging.info(f"Creating directory: {filedir} for the file {filename}")

    # Guard clause: leave files that already hold content untouched.
    if filepath.exists() and filepath.stat().st_size != 0:
        logging.info(f"{filename} already exists")
        continue

    starter_code = _STARTER_CODE.get(filename, "")
    # Explicit encoding so the output does not depend on the platform default.
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(starter_code)

    # Report what was actually written instead of always claiming "empty".
    if starter_code:
        logging.info(f"Creating file with starter code: {filepath}")
    else:
        logging.info(f"Creating empty file: {filepath}")

logging.info("Complete project structure created successfully!")