The document classification system uses advanced AI models to automatically categorize documents based on their content, structure, and visual elements. It provides accurate classification with confidence scores and supports both single-page and multi-page document analysis.
# Create detailed classification prompt
def create_custom_classification_prompt(categories):
    """Build a single-label classification prompt for one document page.

    Args:
        categories: Iterable of category names the model may choose from.
            A bare string is treated as one category instead of being
            split into individual characters.

    Returns:
        The prompt text. Every category is listed in double quotes, and the
        model is told to answer with exactly one of the category names.

    Raises:
        ValueError: If no categories are supplied, since the resulting
            prompt would ask the model to choose from nothing.
    """
    if isinstance(categories, str):
        # Iterating a str would yield one "category" per character.
        categories = [categories]
    # Materialize once so one-shot iterables can be checked and then joined.
    categories = list(categories)
    if not categories:
        raise ValueError("categories must contain at least one category name")

    formatted_conditions = ", ".join(f'"{cat}"' for cat in categories)

    # NOTE(review): the wording below is unchanged; the line breaks inside the
    # literal are a reconstruction and may differ from the original layout.
    prompt = f"""
    You are a document classification expert analyzing a document page.
    Your task is to classify this document into EXACTLY ONE of these categories: {formatted_conditions}.

    Examine the document content, structure, layout, and any visual elements to determine its type.
    Look for headers, titles, formatting patterns, and specific content that helps identify the document type.

    IMPORTANT: Your response MUST be exactly one of the provided category names.
    Do not add any explanation, prefix, suffix, or quotation marks.

    Select the category that best matches from these options: {formatted_conditions}
    """
    return prompt


# Use custom prompt for classification
categories = ["invoice", "contract", "report", "letter"]
custom_prompt = create_custom_classification_prompt(categories)
def calculate_confidence_score(classification, logprobs):
    """Calculate confidence score from model logprobs.

    The tokens that spell out ``classification`` in the response are located,
    their log-probabilities are averaged, and the average is converted back
    to a probability with ``exp``.

    Args:
        classification: The category name returned by the model.
        logprobs: Object with a ``content`` sequence whose items expose
            ``token`` (str) and ``logprob`` (float) attributes.

    Returns:
        A float clamped to the range [0.1, 0.99]. Returns 0.5 when there are
        no logprobs, the classification is empty, or the classification does
        not occur in the token text.
    """
    # Local import: no module-level import of math is visible in this file.
    import math

    default_confidence = 0.5
    if not logprobs or not logprobs.content:
        return default_confidence

    target = classification.strip().lower() if classification else ""
    if not target:
        # An empty needle would match every token and average unrelated text.
        return default_confidence

    # Items without token text are skipped, as before.
    pieces = [
        (item.token.lower(), item.logprob)
        for item in logprobs.content
        if item.token
    ]

    # Character span [start, end) of each token in the concatenated text.
    # NOTE(review): assumes the tokens concatenate to the response text —
    # confirm against the API in use.
    spans = []
    offset = 0
    for token_text, _ in pieces:
        spans.append((offset, offset + len(token_text)))
        offset += len(token_text)
    text = "".join(token_text for token_text, _ in pieces)

    # A category name is often emitted as several sub-word tokens, so a
    # per-token "classification in token" test can never match it. Instead,
    # find the name in the joined text and keep every token that overlaps an
    # occurrence. A token that contains the whole name still matches.
    matched = set()
    start = text.find(target)
    while start != -1:
        end = start + len(target)
        matched.update(
            index
            for index, (tok_start, tok_end) in enumerate(spans)
            if tok_start < end and tok_end > start
        )
        start = text.find(target, start + 1)

    if not matched:
        return default_confidence

    relevant_logprobs = [pieces[index][1] for index in sorted(matched)]

    # Calculate average logprob and convert to confidence
    avg_logprob = sum(relevant_logprobs) / len(relevant_logprobs)
    return min(max(math.exp(avg_logprob), 0.1), 0.99)
# Settings for the document classification request.
# NOTE(review): 'gpt-4-vision-preview' is a preview model identifier —
# confirm it is still served before relying on this default.
classification_config = {
    'model': 'gpt-4-vision-preview',  # Vision model to use
    'max_retries': 2,                 # Retry attempts for failed requests
    'timeout': 30,                    # Request timeout in seconds
    'temperature': 0.1,               # Model temperature for consistency
    'max_tokens': 50,                 # Maximum tokens in response
}