doc-parser

安装量: 254
排名: #3446

安装

npx skills add https://github.com/claude-office-skills/skills --skill doc-parser

Document Parser Skill

Overview

This skill enables advanced document parsing using docling, IBM's state-of-the-art document understanding library. Parse complex PDFs, Word documents, and images while preserving structure, extracting tables and figures, and handling multi-column layouts.

How to Use

1. Provide the document to parse.
2. Specify what you want to extract (text, tables, figures, etc.).
3. I'll parse it and return structured data.

Example prompts:
- "Parse this PDF and extract all tables"
- "Convert this academic paper to structured markdown"
- "Extract figures and captions from this document"
- "Parse this report preserving the document structure"

Domain Knowledge

docling Fundamentals

from docling.document_converter import DocumentConverter

Initialize converter

converter

DocumentConverter ( )

Convert document

result

converter . convert ( "document.pdf" )

Access parsed content

doc

result . document print ( doc . export_to_markdown ( ) ) Supported Formats Format Extension Notes PDF .pdf Native and scanned Word .docx Full structure preserved PowerPoint .pptx Slides as sections Images .png, .jpg OCR + layout analysis HTML .html Structure preserved Basic Usage from docling . document_converter import DocumentConverter

Create converter

converter

DocumentConverter ( )

Convert single document

result

converter . convert ( "report.pdf" )

Access document

doc

result . document

Export options

markdown

doc . export_to_markdown ( ) text = doc . export_to_text ( ) json_doc = doc . export_to_dict ( ) Advanced Configuration from docling . document_converter import DocumentConverter from docling . datamodel . base_models import InputFormat from docling . datamodel . pipeline_options import PdfPipelineOptions

Configure pipeline

pipeline_options

PdfPipelineOptions ( ) pipeline_options . do_ocr = True pipeline_options . do_table_structure = True pipeline_options . table_structure_options . do_cell_matching = True

Create converter with options

converter

DocumentConverter ( allowed_formats = [ InputFormat . PDF , InputFormat . DOCX ] , pdf_backend_options = pipeline_options ) result = converter . convert ( "document.pdf" ) Document Structure

Document hierarchy

doc

result . document

Access metadata

print ( doc . name ) print ( doc . origin )

Iterate through content

for element in doc . iterate_items ( ) : print ( f"Type: { element . type } " ) print ( f"Text: { element . text } " ) if element . type == "table" : print ( f"Rows: { len ( element . data . table_cells ) } " ) Extracting Tables from docling . document_converter import DocumentConverter import pandas as pd def extract_tables ( doc_path ) : """Extract all tables from document.""" converter = DocumentConverter ( ) result = converter . convert ( doc_path ) doc = result . document tables = [ ] for element in doc . iterate_items ( ) : if element . type == "table" :

Get table data

table_data

element . export_to_dataframe ( ) tables . append ( { 'page' : element . prov [ 0 ] . page_no if element . prov else None , 'dataframe' : table_data } ) return tables

Usage

tables

extract_tables ( "report.pdf" ) for i , table in enumerate ( tables ) : print ( f"Table { i + 1 } on page { table [ 'page' ] } :" ) print ( table [ 'dataframe' ] ) Extracting Figures def extract_figures ( doc_path , output_dir ) : """Extract figures with captions.""" import os converter = DocumentConverter ( ) result = converter . convert ( doc_path ) doc = result . document figures = [ ] os . makedirs ( output_dir , exist_ok = True ) for element in doc . iterate_items ( ) : if element . type == "picture" : figure_info = { 'caption' : element . caption if hasattr ( element , 'caption' ) else None , 'page' : element . prov [ 0 ] . page_no if element . prov else None , }

Save image if available

if hasattr ( element , 'image' ) : img_path = os . path . join ( output_dir , f"figure_ { len ( figures ) + 1 } .png" ) element . image . save ( img_path ) figure_info [ 'path' ] = img_path figures . append ( figure_info ) return figures Handling Multi-column Layouts from docling . document_converter import DocumentConverter def parse_multicolumn ( doc_path ) : """Parse document with multi-column layout.""" converter = DocumentConverter ( ) result = converter . convert ( doc_path ) doc = result . document

docling automatically handles column detection

Text is returned in reading order

structured_content

[ ] for element in doc . iterate_items ( ) : content_item = { 'type' : element . type , 'text' : element . text if hasattr ( element , 'text' ) else None , 'level' : element . level if hasattr ( element , 'level' ) else None , }

Add bounding box if available

if element . prov : content_item [ 'bbox' ] = element . prov [ 0 ] . bbox content_item [ 'page' ] = element . prov [ 0 ] . page_no structured_content . append ( content_item ) return structured_content Export Formats from docling . document_converter import DocumentConverter converter = DocumentConverter ( ) result = converter . convert ( "document.pdf" ) doc = result . document

Markdown export

markdown

doc . export_to_markdown ( ) with open ( "output.md" , "w" ) as f : f . write ( markdown )

Plain text

text

doc . export_to_text ( )

JSON/dict format

json_doc

doc . export_to_dict ( )

HTML format (if supported)

html = doc.export_to_html()

Batch Processing
from
docling
.
document_converter
import
DocumentConverter
from
pathlib
import
Path
from
concurrent
.
futures
import
ThreadPoolExecutor
def batch_parse(input_dir, output_dir, max_workers=4):
    """Parse every PDF and DOCX file in a directory in parallel.

    Each ``*.pdf`` and ``*.docx`` file found directly inside ``input_dir``
    is converted with a single shared ``DocumentConverter`` and written to
    ``output_dir`` as ``<stem>.md``.

    Args:
        input_dir: Directory searched (non-recursively) for input files.
        output_dir: Directory that receives the Markdown files. It is
            created, including any missing parent directories, if needed.
        max_workers: Maximum number of worker threads.

    Returns:
        A list with one dict per input file, in the same order as the
        collected inputs (PDF matches first, then DOCX matches):
        ``{'file': ..., 'status': 'success'}`` on success, or
        ``{'file': ..., 'status': 'error', 'error': ...}`` on failure.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    # parents=True so a nested output directory (e.g. "out/markdown") does
    # not raise FileNotFoundError when intermediate directories are missing.
    output_path.mkdir(parents=True, exist_ok=True)

    converter = DocumentConverter()

    def process_single(doc_path):
        """Convert one document and report success or the error text."""
        try:
            result = converter.convert(str(doc_path))
            md = result.document.export_to_markdown()
            # NOTE: inputs that share a stem (report.pdf and report.docx)
            # map to the same output file; the last writer wins.
            out_file = output_path / f"{doc_path.stem}.md"
            # Explicit UTF-8 so non-ASCII text in the Markdown is written
            # the same way regardless of the platform's locale encoding.
            with open(out_file, 'w', encoding='utf-8') as f:
                f.write(md)
            return {'file': str(doc_path), 'status': 'success'}
        except Exception as e:
            # Deliberate best-effort boundary: a single failing document is
            # reported in the result list instead of aborting the batch.
            return {'file': str(doc_path), 'status': 'error', 'error': str(e)}

    docs = list(input_path.glob('*.pdf')) + list(input_path.glob('*.docx'))

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in the order of ``docs``.
        results = list(executor.map(process_single, docs))

    return results
Best Practices
Use Appropriate Pipeline
Configure for your document type
Handle Large Documents
Process in chunks if needed
Verify Table Extraction
Complex tables may need review
Check OCR Quality
Enable OCR for scanned documents
Cache Results
Store parsed documents for reuse Common Patterns Academic Paper Parser def parse_academic_paper ( pdf_path ) : """Parse academic paper structure.""" converter = DocumentConverter ( ) result = converter . convert ( pdf_path ) doc = result . document paper = { 'title' : None , 'abstract' : None , 'sections' : [ ] , 'references' : [ ] , 'tables' : [ ] , 'figures' : [ ] } current_section = None for element in doc . iterate_items ( ) : text = element . text if hasattr ( element , 'text' ) else '' if element . type == 'title' : paper [ 'title' ] = text elif element . type == 'heading' : if 'abstract' in text . lower ( ) : current_section = 'abstract' elif 'reference' in text . lower ( ) : current_section = 'references' else : paper [ 'sections' ] . append ( { 'title' : text , 'content' : '' } ) current_section = 'section' elif element . type == 'paragraph' : if current_section == 'abstract' : paper [ 'abstract' ] = text elif current_section == 'section' and paper [ 'sections' ] : paper [ 'sections' ] [ - 1 ] [ 'content' ] += text + '\n' elif element . type == 'table' : paper [ 'tables' ] . append ( { 'caption' : element . caption if hasattr ( element , 'caption' ) else None , 'data' : element . export_to_dataframe ( ) if hasattr ( element , 'export_to_dataframe' ) else None } ) return paper Report to Structured Data def parse_business_report ( doc_path ) : """Parse business report into structured format.""" converter = DocumentConverter ( ) result = converter . convert ( doc_path ) doc = result . document report = { 'metadata' : { 'title' : None , 'date' : None , 'author' : None } , 'executive_summary' : None , 'sections' : [ ] , 'key_metrics' : [ ] , 'recommendations' : [ ] }

Parse document structure

for element in doc . iterate_items ( ) :

Implement parsing logic based on document structure

pass return report Examples Example 1: Parse Financial Report from docling . document_converter import DocumentConverter def parse_financial_report ( pdf_path ) : """Extract structured data from financial report.""" converter = DocumentConverter ( ) result = converter . convert ( pdf_path ) doc = result . document financial_data = { 'income_statement' : None , 'balance_sheet' : None , 'cash_flow' : None , 'notes' : [ ] }

Extract tables

tables

[ ] for element in doc . iterate_items ( ) : if element . type == 'table' : table_df = element . export_to_dataframe ( )

Identify table type

if 'revenue' in str ( table_df ) . lower ( ) or 'income' in str ( table_df ) . lower ( ) : financial_data [ 'income_statement' ] = table_df elif 'asset' in str ( table_df ) . lower ( ) or 'liabilities' in str ( table_df ) . lower ( ) : financial_data [ 'balance_sheet' ] = table_df elif 'cash' in str ( table_df ) . lower ( ) : financial_data [ 'cash_flow' ] = table_df else : tables . append ( table_df )

Extract markdown for notes

financial_data [ 'markdown' ] = doc . export_to_markdown ( ) return financial_data report = parse_financial_report ( 'annual_report.pdf' ) print ( "Income Statement:" ) print ( report [ 'income_statement' ] ) Example 2: Technical Documentation Parser from docling . document_converter import DocumentConverter def parse_technical_docs ( doc_path ) : """Parse technical documentation.""" converter = DocumentConverter ( ) result = converter . convert ( doc_path ) doc = result . document documentation = { 'title' : None , 'version' : None , 'sections' : [ ] , 'code_blocks' : [ ] , 'diagrams' : [ ] } current_section = None for element in doc . iterate_items ( ) : if element . type == 'title' : documentation [ 'title' ] = element . text elif element . type == 'heading' : current_section = { 'title' : element . text , 'level' : element . level if hasattr ( element , 'level' ) else 1 , 'content' : [ ] } documentation [ 'sections' ] . append ( current_section ) elif element . type == 'code' : if current_section : current_section [ 'content' ] . append ( { 'type' : 'code' , 'content' : element . text } ) documentation [ 'code_blocks' ] . append ( element . text ) elif element . type == 'picture' : documentation [ 'diagrams' ] . append ( { 'page' : element . prov [ 0 ] . page_no if element . prov else None , 'caption' : element . caption if hasattr ( element , 'caption' ) else None } ) return documentation docs = parse_technical_docs ( 'api_documentation.pdf' ) print ( f"Title: { docs [ 'title' ] } " ) print ( f"Sections: { len ( docs [ 'sections' ] ) } " ) Example 3: Contract Analysis from docling . document_converter import DocumentConverter def analyze_contract ( pdf_path ) : """Parse contract document for key clauses.""" converter = DocumentConverter ( ) result = converter . convert ( pdf_path ) doc = result . document contract = { 'parties' : [ ] , 'clauses' : [ ] , 'dates' : [ ] , 'amounts' : [ ] , 'full_text' : doc . export_to_text ( ) } import re

Extract dates

date_pattern

r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b' contract [ 'dates' ] = re . findall ( date_pattern , contract [ 'full_text' ] , re . IGNORECASE )

Extract monetary amounts

amount_pattern

r'\$[\d,]+(?:\.\d{2})?|\b\d+(?:,\d{3})*(?:\.\d{2})?\s*(?:USD|dollars)\b' contract [ 'amounts' ] = re . findall ( amount_pattern , contract [ 'full_text' ] , re . IGNORECASE )

Parse sections as clauses

for element in doc . iterate_items ( ) : if element . type == 'heading' : contract [ 'clauses' ] . append ( { 'title' : element . text , 'content' : '' } ) elif element . type == 'paragraph' and contract [ 'clauses' ] : contract [ 'clauses' ] [ - 1 ] [ 'content' ] += element . text + '\n' return contract contract_data = analyze_contract ( 'agreement.pdf' ) print ( f"Key dates: { contract_data [ 'dates' ] } " ) print ( f"Amounts: { contract_data [ 'amounts' ] } " ) Limitations Very large documents may require chunking Handwritten content needs OCR preprocessing Complex nested tables may need manual review Some PDF types (encrypted) not supported GPU recommended for best performance Installation pip install docling

For full functionality

pip install "docling[all]"

For OCR support

pip install "docling[ocr]"

Resources

- docling GitHub
- Documentation
- IBM Research Blog

返回排行榜