Structured Output Extractor

Extract reliable, typed data from LLM responses.

Core Workflow

1. Define schema: Create data structure
2. Choose method: Function calling vs prompting
3. Generate response: Call LLM with structure
4. Validate output: Parse and verify
5. Handle errors: Retry or fallback

Methods Comparison

| Method           | Reliability | Flexibility | Best For        |
| ---------------- | ----------- | ----------- | --------------- |
| OpenAI JSON Mode | High        | Medium      | Simple JSON     |
| Function Calling | Very High   | High        | Complex schemas |
| Instructor       | Very High   | High        | Python/TS apps  |
| Zod + Prompting  | Medium      | High        | Custom parsing  |

OpenAI Structured Outputs

JSON Mode

// extractors/json-mode.ts
import OpenAI from 'openai';

const openai = new OpenAI();

/**
 * Contact record with two required string fields and two optional ones.
 * The field names match the JSON keys requested by the JSON-mode prompt.
 */
interface ExtractedData {
  /** Required. */
  name: string;
  /** Required. */
  email: string;
  /** May be omitted. */
  phone?: string;
  /** May be omitted. */
  company?: string;
}

/**
 * Extracts contact details from free-form text using OpenAI JSON mode.
 *
 * JSON mode only guarantees syntactically valid JSON, not a particular shape,
 * so the parsed payload is checked before it is returned. The prompt asks the
 * model for `null` when a phone/company is missing; those nulls are dropped so
 * the result matches the optional fields of `ExtractedData`.
 *
 * @param text - Raw text that may contain contact information.
 * @returns The extracted contact.
 * @throws Error if the response has no content, or if the payload is not an
 *   object with string `name` and `email` fields.
 * @throws SyntaxError if the returned content is not valid JSON.
 */
export async function extractContactInfo(text: string): Promise<ExtractedData> {
  const response = await openai.chat.completions.create({
    model: 'gpt-4-turbo-preview',
    response_format: { type: 'json_object' },
    messages: [
      {
        role: 'system',
        content: `Extract contact information from text. Return JSON with: { "name": "string", "email": "string", "phone": "string or null", "company": "string or null" }`,
      },
      { role: 'user', content: text },
    ],
  });

  const content = response.choices[0]?.message.content;
  if (content == null) {
    throw new Error('Model response contained no content');
  }

  const parsed: unknown = JSON.parse(content);
  if (typeof parsed !== 'object' || parsed === null) {
    throw new Error('Model response was not a JSON object');
  }

  const { name, email, phone, company } = parsed as Record<string, unknown>;
  if (typeof name !== 'string' || typeof email !== 'string') {
    throw new Error('Model response is missing a string "name" or "email"');
  }

  return {
    name,
    email,
    // Only keep optional fields when the model actually produced a string.
    ...(typeof phone === 'string' ? { phone } : {}),
    ...(typeof company === 'string' ? { company } : {}),
  };
}

Structured Outputs (Beta)

// extractors/structured.ts
import OpenAI from 'openai';
import { z } from 'zod';
import { zodResponseFormat } from 'openai/helpers/zod';

/**
 * Zod schema for a contact. Every key is required; fields that may be absent
 * from the source text are modelled as nullable rather than optional.
 * The `.describe()` strings are attached to each field as descriptions.
 */
const ContactSchema = z.object({
  name: z.string().describe('Full name of the contact'),
  email: z.string().email().describe('Email address'),
  phone: z.string().nullable().describe('Phone number if available'),
  company: z.string().nullable().describe('Company name if mentioned'),
  role: z.string().nullable().describe('Job title or role'),
});

/** Static type inferred from `ContactSchema`. */
type Contact = z.infer<typeof ContactSchema>;

/**
 * Extracts a contact from text using OpenAI Structured Outputs, with
 * `ContactSchema` supplied as the response format.
 *
 * @param text - Raw text that may contain contact information.
 * @returns The parsed contact object.
 * @throws Error if the response has no choices, the model refused the request,
 *   or no parsed payload is present.
 */
export async function extractContact(text: string): Promise<Contact> {
  const response = await openai.beta.chat.completions.parse({
    model: 'gpt-4o-2024-08-06',
    messages: [
      {
        role: 'system',
        content: 'Extract contact information from the provided text.',
      },
      { role: 'user', content: text },
    ],
    response_format: zodResponseFormat(ContactSchema, 'contact'),
  });

  const message = response.choices[0]?.message;
  if (!message) {
    throw new Error('Model response contained no choices');
  }
  // A refusal arrives instead of a parsed payload; surface it explicitly
  // rather than returning null behind a non-null assertion.
  if (message.refusal) {
    throw new Error(`Model refused to extract contact: ${message.refusal}`);
  }
  if (message.parsed == null) {
    throw new Error('Model response contained no parsed contact');
  }

  return message.parsed;
}

Function Calling

Define Functions

// extractors/function-calling.ts
import OpenAI from 'openai';

const openai = new OpenAI();

/**
 * Function definitions offered to the model.
 *
 * The single `extract_entities` function takes four required arrays
 * (`people`, `organizations`, `locations`, `dates`); within each array item
 * only the first listed field is required.
 */
const functions = [
  {
    name: 'extract_entities',
    description: 'Extract named entities from text',
    parameters: {
      type: 'object',
      properties: {
        // People: only `name` is mandatory.
        people: {
          type: 'array',
          items: {
            type: 'object',
            properties: {
              name: { type: 'string' },
              role: { type: 'string' },
              organization: { type: 'string' },
            },
            required: ['name'],
          },
        },
        // Organizations, with a closed set of categories.
        organizations: {
          type: 'array',
          items: {
            type: 'object',
            properties: {
              name: { type: 'string' },
              type: {
                type: 'string',
                enum: ['company', 'nonprofit', 'government', 'other'],
              },
            },
            required: ['name'],
          },
        },
        // Locations, with a closed set of categories.
        locations: {
          type: 'array',
          items: {
            type: 'object',
            properties: {
              name: { type: 'string' },
              type: {
                type: 'string',
                enum: ['city', 'country', 'address', 'other'],
              },
            },
            required: ['name'],
          },
        },
        // Dates: the literal text is mandatory, the normalized form is not.
        dates: {
          type: 'array',
          items: {
            type: 'object',
            properties: {
              text: { type: 'string' },
              normalized: { type: 'string', format: 'date' },
            },
            required: ['text'],
          },
        },
      },
      required: ['people', 'organizations', 'locations', 'dates'],
    },
  },
];

/**
 * Extracts named entities from text by forcing the model to call the
 * `extract_entities` function declared in `functions`.
 *
 * @param text - Text to analyse.
 * @returns The function-call arguments parsed from JSON. The result is not
 *   validated against the declared parameter schema.
 * @throws Error if the response does not include a function call.
 * @throws SyntaxError if the arguments are not valid JSON.
 */
export async function extractEntities(text: string) {
  const response = await openai.chat.completions.create({
    model: 'gpt-4-turbo-preview',
    messages: [
      {
        role: 'system',
        content: 'Extract all named entities from the provided text.',
      },
      { role: 'user', content: text },
    ],
    functions,
    function_call: { name: 'extract_entities' },
  });

  // Guard instead of asserting non-null: a missing call should fail with a
  // descriptive error rather than "cannot read properties of undefined".
  const functionCall = response.choices[0]?.message.function_call;
  if (!functionCall) {
    throw new Error('Model response did not include a function call');
  }

  return JSON.parse(functionCall.arguments);
}

Tool Use Pattern

// extractors/tools.ts
import OpenAI from 'openai';

/**
 * Tool definitions offered to the model.
 *
 * The single `classify_intent` tool requires `intent` and `confidence`;
 * `entities` and `suggestedAction` are optional extras.
 */
const tools: OpenAI.Chat.ChatCompletionTool[] = [
  {
    type: 'function',
    function: {
      name: 'classify_intent',
      description: 'Classify the user intent from their message',
      parameters: {
        type: 'object',
        properties: {
          // Closed set of intents the model must choose from.
          intent: {
            type: 'string',
            enum: ['question', 'complaint', 'feedback', 'request', 'other'],
          },
          // Declared as a number bounded to [0, 1].
          confidence: {
            type: 'number',
            minimum: 0,
            maximum: 1,
          },
          entities: {
            type: 'object',
            properties: {
              product: { type: 'string' },
              issue: { type: 'string' },
              sentiment: {
                type: 'string',
                enum: ['positive', 'negative', 'neutral'],
              },
            },
          },
          suggestedAction: {
            type: 'string',
          },
        },
        required: ['intent', 'confidence'],
      },
    },
  },
];

/**
 * Classifies a user message by forcing the model to call the
 * `classify_intent` tool declared in `tools`.
 *
 * @param message - The user message to classify.
 * @returns The tool-call arguments parsed from JSON. The result is not
 *   validated against the declared parameter schema.
 * @throws Error if the response does not include a tool call.
 * @throws SyntaxError if the arguments are not valid JSON.
 */
export async function classifyMessage(message: string) {
  const response = await openai.chat.completions.create({
    model: 'gpt-4-turbo-preview',
    messages: [{ role: 'user', content: message }],
    tools,
    tool_choice: { type: 'function', function: { name: 'classify_intent' } },
  });

  // Guard instead of asserting non-null so a missing tool call produces a
  // descriptive error.
  const toolCall = response.choices[0]?.message.tool_calls?.[0];
  if (!toolCall) {
    throw new Error('Model response did not include a tool call');
  }

  return JSON.parse(toolCall.function.arguments);
}

Zod Schema Validation

With Instructor-like Pattern

// extractors/zod-extractor.ts
import { z } from 'zod';
import OpenAI from 'openai';

const openai = new OpenAI();

// Define schema
/**
 * Zod schema for a structured product review: a 1–5 rating, lists of pros and
 * cons, a summary capped at 200 characters, a recommendation flag and an
 * overall sentiment label.
 */
const ProductReviewSchema = z.object({
  productName: z.string(),
  rating: z.number().min(1).max(5),
  pros: z.array(z.string()),
  cons: z.array(z.string()),
  summary: z.string().max(200),
  wouldRecommend: z.boolean(),
  sentiment: z.enum(['positive', 'negative', 'mixed', 'neutral']),
});

/** Static type inferred from `ProductReviewSchema`. */
type ProductReview = z.infer<typeof ProductReviewSchema>;

// Generate JSON schema from Zod
/**
 * Builds a minimal JSON-schema-like object from a Zod object schema.
 *
 * Simplified - use zod-to-json-schema in production.
 *
 * NOTE(review): `zodTypeToJson` is not defined in this document; it must be
 * supplied (mapping one Zod field type to its JSON-schema fragment) before
 * this helper can run.
 *
 * @param schema - Zod object schema whose top-level fields are converted.
 * @returns An object schema listing each field under `properties` and every
 *   non-optional field under `required`.
 */
function zodToJsonSchema(schema: z.ZodObject<z.ZodRawShape>) {
  const shape = schema.shape;
  const properties: Record<string, unknown> = {};
  const required: string[] = [];

  for (const [key, value] of Object.entries(shape)) {
    const zodType = value as z.ZodTypeAny;
    properties[key] = zodTypeToJson(zodType);
    // Fields that do not accept `undefined` are reported as required.
    if (!zodType.isOptional()) {
      required.push(key);
    }
  }

  return { type: 'object', properties, required };
}

/**
 * Extracts a structured product review from free-form review text.
 *
 * The JSON schema derived from `ProductReviewSchema` is embedded in the system
 * prompt, the model is called in JSON mode, and the reply is validated with
 * Zod before being returned.
 *
 * @param reviewText - The raw review text.
 * @returns The validated review.
 * @throws Error if the response has no content.
 * @throws SyntaxError if the returned content is not valid JSON.
 * @throws z.ZodError if the parsed JSON does not satisfy `ProductReviewSchema`.
 */
export async function extractReview(reviewText: string): Promise<ProductReview> {
  const schema = zodToJsonSchema(ProductReviewSchema);

  const response = await openai.chat.completions.create({
    model: 'gpt-4-turbo-preview',
    response_format: { type: 'json_object' },
    messages: [
      {
        role: 'system',
        content: `Extract a structured product review. Return JSON matching this schema: ${JSON.stringify(schema, null, 2)}`,
      },
      { role: 'user', content: reviewText },
    ],
  });

  const content = response.choices[0]?.message.content;
  if (content == null) {
    throw new Error('Model response contained no content');
  }

  const data: unknown = JSON.parse(content);

  // Validate with Zod
  return ProductReviewSchema.parse(data);
}

// With Retry on Validation Error
// extractors/retry.ts

/**
 * Calls the model in JSON mode and validates the reply against `schema`,
 * retrying when validation fails.
 *
 * On each retry the previous validation problems are appended as an extra user
 * message so the model can correct its output. Only validation failures (and
 * empty replies) are retried; any other error, including a JSON syntax error
 * or an API failure, is rethrown immediately.
 *
 * @param schema - Zod schema the parsed JSON must satisfy.
 * @param prompt - System prompt. NOTE(review): JSON mode expects the
 *   conversation to ask for JSON; confirm callers' prompts do so.
 * @param text - User content to extract from.
 * @param maxRetries - Maximum number of attempts (default 3).
 * @returns The validated value.
 * @throws Error once `maxRetries` attempts have all failed validation.
 */
export async function extractWithRetry<T>(
  schema: z.ZodSchema<T>,
  prompt: string,
  text: string,
  maxRetries = 3
): Promise<T> {
  let lastError: Error | null = null;

  for (let attempt = 0; attempt < maxRetries; attempt++) {
    try {
      const response = await openai.chat.completions.create({
        model: 'gpt-4-turbo-preview',
        response_format: { type: 'json_object' },
        messages: [
          { role: 'system', content: prompt },
          { role: 'user', content: text },
          // Add error context on retry
          ...(lastError
            ? [
                {
                  role: 'user' as const,
                  content: `Previous attempt failed validation: ${lastError.message}. Please fix.`,
                },
              ]
            : []),
        ],
      });

      const content = response.choices[0]?.message.content;
      if (content == null) {
        // An empty reply cannot validate; treat it like a validation failure
        // so it is retried instead of crashing on a non-null assertion.
        lastError = new Error('response contained no content');
        continue;
      }

      const data: unknown = JSON.parse(content);
      return schema.parse(data);
    } catch (error) {
      if (error instanceof z.ZodError) {
        // Flatten the issues into "path: message" pairs for the retry prompt.
        lastError = new Error(
          error.issues.map((e) => `${e.path.join('.')}: ${e.message}`).join(', ')
        );
      } else {
        throw error;
      }
    }
  }

  throw new Error(`Failed after ${maxRetries} attempts: ${lastError?.message}`);
}

// Complex Extraction Patterns
// Hierarchical Data
// extractors/hierarchical.ts

/**
 * Zod schema for a hierarchical document: title, authors, abstract, sections
 * (each with optional one-level subsections), references and keywords.
 */
const DocumentSchema = z.object({
  title: z.string(),
  authors: z.array(
    z.object({
      name: z.string(),
      affiliation: z.string().optional(),
    })
  ),
  abstract: z.string(),
  sections: z.array(
    z.object({
      heading: z.string(),
      content: z.string(),
      // Subsections are not nested further.
      subsections: z
        .array(
          z.object({
            heading: z.string(),
            content: z.string(),
          })
        )
        .optional(),
    })
  ),
  references: z.array(
    z.object({
      authors: z.array(z.string()),
      title: z.string(),
      year: z.number(),
      source: z.string().optional(),
    })
  ),
  keywords: z.array(z.string()),
});

// Multi-step Extraction
// extractors/multistep.ts

/**
 * Extracts a document in three passes: overall structure, per-section details
 * (run in parallel), and references when the structure pass reports any.
 *
 * NOTE(review): `extract` is not defined in this document. It is called as
 * `extract(schema, prompt, text)`, the same argument order as
 * `extractWithRetry` — confirm which helper is intended. If that helper uses
 * JSON mode, the top-level array schema used for references may need to be
 * wrapped in an object.
 *
 * @param document - Full document text; every pass receives the whole text.
 * @returns The per-section extractions and the reference list (empty when the
 *   structure pass reports no references).
 */
export async function extractComplex(document: string) {
  // Step 1: Extract structure
  const structure = await extract(
    z.object({
      sections: z.array(z.string()),
      hasReferences: z.boolean(),
    }),
    'Identify the document structure',
    document
  );

  // Step 2: Extract each section
  const sections = await Promise.all(
    structure.sections.map((section) =>
      extract(
        z.object({
          heading: z.string(),
          summary: z.string(),
          keyPoints: z.array(z.string()),
        }),
        `Extract details from section: ${section}`,
        document
      )
    )
  );

  // Step 3: Extract references if present
  const ReferenceListSchema = z.array(
    z.object({
      authors: z.array(z.string()),
      title: z.string(),
      year: z.number(),
    })
  );
  // Typed explicitly so the empty default does not widen to any[].
  let references: z.infer<typeof ReferenceListSchema> = [];
  if (structure.hasReferences) {
    references = await extract(ReferenceListSchema, 'Extract all references', document);
  }

  return { sections, references };
}

Streaming Structured Output

// extractors/streaming.ts
import { zodToJsonSchema } from 'zod-to-json-schema';

/**
 * Streams a JSON-mode completion and yields parsed snapshots as they become
 * available, followed by one final validated result.
 *
 * Intermediate snapshots are produced with plain `JSON.parse` on the text
 * accumulated so far, so one is only yielded when that text happens to be
 * complete, valid JSON; incomplete text is skipped.
 *
 * @param schema - Zod schema used to validate the final result.
 * @param prompt - Instruction text; the JSON schema is appended to it.
 * @param text - User content to extract from.
 * @returns An async generator yielding `{ partial, complete: false }` for
 *   unvalidated snapshots and a last `{ partial, complete: true }` holding the
 *   validated value.
 * @throws SyntaxError if the full streamed text is not valid JSON.
 * @throws z.ZodError if the final JSON does not satisfy `schema`.
 */
export async function* streamExtract<T>(
  schema: z.ZodSchema<T>,
  prompt: string,
  text: string
): AsyncGenerator<{ partial: any; complete: boolean }> {
  const jsonSchema = zodToJsonSchema(schema);

  const stream = await openai.chat.completions.create({
    model: 'gpt-4-turbo-preview',
    response_format: { type: 'json_object' },
    messages: [
      {
        role: 'system',
        content: `${prompt}\n\nReturn JSON matching: ${JSON.stringify(jsonSchema)}`,
      },
      { role: 'user', content: text },
    ],
    stream: true,
  });

  let fullContent = '';

  for await (const chunk of stream) {
    const content = chunk.choices[0]?.delta?.content || '';
    fullContent += content;

    // Try to parse partial JSON
    try {
      const partial = JSON.parse(fullContent);
      yield { partial, complete: false };
    } catch {
      // JSON not complete yet
    }
  }

  // Final parse and validate
  const final: unknown = JSON.parse(fullContent);
  const validated = schema.parse(final);
  yield { partial: validated, complete: true };
}

// Error Handling
// extractors/error-handling.ts

/**
 * Error raised when structured extraction fails.
 *
 * Carries the `raw` string supplied by the caller and, optionally, the Zod
 * validation error that caused the failure.
 */
export class ExtractionError extends Error {
  /**
   * @param message - Human-readable failure description.
   * @param raw - Text associated with the failure, as supplied by the caller.
   * @param validationErrors - The Zod error, when the failure was a
   *   validation failure.
   */
  constructor(
    message: string,
    public readonly raw: string,
    public readonly validationErrors?: z.ZodError
  ) {
    super(message);
    // Override the default "Error" name so logs identify the error type.
    this.name = 'ExtractionError';
  }
}

/**
 * Runs `extractWithRetry` and converts any thrown error into a result object
 * instead of letting it propagate.
 *
 * The input `text` is what gets stored as `raw` on the returned
 * `ExtractionError`.
 *
 * NOTE(review): `extractWithRetry` catches Zod errors and, once its attempts
 * are used up, throws a plain `Error`, so the `ZodError` branch below looks
 * unreachable through that path and `validationErrors` will normally be unset
 * — confirm whether the Zod error should be propagated.
 *
 * @param schema - Zod schema the extracted value must satisfy.
 * @param prompt - System prompt forwarded to `extractWithRetry`.
 * @param text - User content to extract from.
 * @returns `{ success: true, data }` on success, otherwise
 *   `{ success: false, error }`. This function does not throw.
 */
export async function safeExtract<T>(
  schema: z.ZodSchema<T>,
  prompt: string,
  text: string
): Promise<{ success: true; data: T } | { success: false; error: ExtractionError }> {
  try {
    const data = await extractWithRetry(schema, prompt, text);
    return { success: true, data };
  } catch (error) {
    if (error instanceof z.ZodError) {
      return {
        success: false,
        error: new ExtractionError('Validation failed', text, error),
      };
    }

    const message = error instanceof Error ? error.message : 'Unknown error';
    return {
      success: false,
      error: new ExtractionError(message, text),
    };
  }
}

Best Practices

1. Use structured outputs: OpenAI's native feature when available
2. Define clear schemas: Descriptive field names and descriptions
3. Validate with Zod: Type-safe runtime validation
4. Retry on failure: With error context
5. Use function calling: For complex schemas
6. Handle edge cases: Null values, empty arrays
7. Stream for UX: Partial results during extraction
8. Test with variations: Different input formats

Output Checklist

Every structured extraction should include:

- Zod schema with descriptions
- Appropriate extraction method (JSON/function/tools)
- Runtime validation
- Retry logic with error context
- Error handling and typing
- Streaming support (if needed)
- Edge case handling
- Type-safe return values
- Comprehensive tests

structured-output-extractor

安装