gmf_forge_ai_data.layout

Layout analysis module — Azure Document Intelligence integration.

Modules: document_intelligence_layout: Analyze documents using Azure Document Intelligence prebuilt-layout model and return markdown output ready for MarkdownChunker.

 1"""
 2Layout analysis module — Azure Document Intelligence integration.
 3
 4Modules:
 5    document_intelligence_layout: Analyze documents using Azure Document Intelligence
 6                                  prebuilt-layout model and return markdown output
 7                                  ready for MarkdownChunker.
 8"""
 9
10from .document_intelligence_layout import DocumentIntelligenceLayout, LayoutResult
11
12__all__ = [
13    "DocumentIntelligenceLayout",
14    "LayoutResult",
15]
class DocumentIntelligenceLayout:
 89class DocumentIntelligenceLayout:
 90    """
 91    Wraps the Azure Document Intelligence ``prebuilt-layout`` model.
 92
 93    Converts PDF, DOCX, PPTX, XLSX, and image files into structured markdown
 94    that preserves headings, tables, lists, and page structure.  The markdown
 95    output is designed to be chunked with ``MarkdownChunker``.
 96
 97    Typical usage::
 98
 99        from gmf_forge_ai_data.layout import DocumentIntelligenceLayout
100        from gmf_forge_ai_data.chunkers import MarkdownChunker
101
102        # Managed identity (omit api_key) — required for Multiservices accounts
103        layout = DocumentIntelligenceLayout(
104            endpoint="https://my-resource.cognitiveservices.azure.com",
105        )
106
107        # API key — for standalone Document Intelligence resources
108        layout = DocumentIntelligenceLayout(
109            endpoint="https://my-resource.cognitiveservices.azure.com",
110            api_key="your-api-key",
111        )
112
113        # From a local file
114        result = layout.analyze_file("annual_report.pdf")
115
116        # From raw bytes (e.g. downloaded from BlobStorageConnector)
117        result = layout.analyze_bytes(blob_content, filename="report.pdf")
118
119        # From a URL
120        result = layout.analyze_url("https://example.com/policy.pdf")
121
122        # Chunk the markdown directly
123        chunker = MarkdownChunker(max_chunk_size=1500, min_header_level=2)
124        chunks  = chunker.chunk(result.markdown, metadata=result.metadata)
125
126    Args:
127        endpoint:    Azure Document Intelligence service endpoint URL.
128        api_key:     Azure Document Intelligence API key.  When provided,
129                     ``AzureKeyCredential`` is used.  When omitted (``None``),
130                     ``DefaultAzureCredential`` is used instead — required for
131                     Cognitive Services Multiservices accounts that do not
132                     expose API keys.
133        model_id:    Model to use (default: ``prebuilt-layout``).
134        api_version: REST API version (default: ``2024-11-30``).
135        logger:      Optional ``BasicLogger`` instance for structured logging.
136
137    Raises:
138        ImportError: If ``azure-ai-documentintelligence`` is not installed, or
139                     if ``api_key`` is omitted and ``azure-identity`` is not
140                     installed.
141        ValueError:  If ``endpoint`` is empty.
142    """
143
144    _DEFAULT_MODEL = "prebuilt-layout"
145    _DEFAULT_API_VERSION = "2024-11-30"
146
147    def __init__(
148        self,
149        endpoint: str,
150        api_key: Optional[str] = None,
151        model_id: str = _DEFAULT_MODEL,
152        api_version: str = _DEFAULT_API_VERSION,
153        logger: Optional[BasicLogger] = None,
154    ) -> None:
155        if not _SDK_AVAILABLE:
156            raise ImportError(
157                "azure-ai-documentintelligence is required: "
158                "pip install azure-ai-documentintelligence"
159            )
160        if not endpoint or not endpoint.strip():
161            raise ValueError("endpoint must not be empty")
162
163        self.endpoint = endpoint.rstrip("/")
164        self.model_id = model_id
165        self.api_version = api_version
166        self.logger = logger or _logger
167
168        if api_key:
169            # Standalone Document Intelligence resource with key-based auth enabled
170            credential = AzureKeyCredential(api_key)
171            auth_method = "api_key"
172        else:
173            # Cognitive Services Multiservices account — no API key exposed.
174            # DefaultAzureCredential resolves the auth chain automatically:
175            #   In Azure (AKS, VM, App Service): managed identity / workload identity
176            #   Locally: az login  or  VS Code Azure account extension
177            if not _IDENTITY_AVAILABLE:
178                raise ImportError(
179                    "azure-identity is required when api_key is not provided: "
180                    "pip install azure-identity"
181                )
182            credential = DefaultAzureCredential()
183            auth_method = "managed_identity"
184
185        self._client = DocumentIntelligenceClient(
186            endpoint=self.endpoint,
187            credential=credential,
188            api_version=self.api_version,
189        )
190
191        self.logger.info(
192            "DocumentIntelligenceLayout initialised",
193            endpoint=self.endpoint,
194            model_id=self.model_id,
195            api_version=self.api_version,
196            auth=auth_method,
197        )
198
199    # ------------------------------------------------------------------
200    # Public API
201    # ------------------------------------------------------------------
202
203    def analyze_file(self, file_path: str | Path) -> LayoutResult:
204        """
205        Analyse a local file and return its content as markdown.
206
207        Supported file types: PDF, DOCX, PPTX, XLSX, JPEG, PNG, BMP, TIFF, HEIF.
208
209        Args:
210            file_path: Path to the local document file.
211
212        Returns:
213            :class:`LayoutResult` with markdown content and metadata.
214
215        Raises:
216            FileNotFoundError: If the file does not exist.
217            ValueError:        If the file is empty.
218        """
219        path = Path(file_path)
220        if not path.exists():
221            raise FileNotFoundError(f"File not found: {file_path}")
222        if path.stat().st_size == 0:
223            raise ValueError(f"File is empty: {file_path}")
224
225        self.logger.info("Analysing file", file=str(path), model_id=self.model_id)
226
227        with open(path, "rb") as fh:
228            content = fh.read()
229
230        result = self._analyze_bytes_content(content)
231        result.metadata.update({
232            "source": str(path.resolve()),
233            "file_name": path.name,
234        })
235        return result
236
237    def analyze_bytes(self, content: bytes, filename: str = "") -> LayoutResult:
238        """
239        Analyse raw bytes and return content as markdown.
240
241        Use this when you already have the document bytes in memory — for
242        example, content downloaded via ``BlobStorageConnector`` or
243        ``SharePointConnector``.
244
245        Args:
246            content:  Raw document bytes.
247            filename: Original filename (used for metadata only, e.g. "report.pdf").
248
249        Returns:
250            :class:`LayoutResult` with markdown content and metadata.
251
252        Raises:
253            ValueError: If ``content`` is empty.
254        """
255        if not content:
256            raise ValueError("content must not be empty")
257
258        content_hash = hashlib.sha256(content).hexdigest()[:12]
259        source = f"bytes:{content_hash}"
260
261        self.logger.info(
262            "Analysing bytes",
263            size_bytes=len(content),
264            filename=filename or "(unnamed)",
265            model_id=self.model_id,
266        )
267
268        result = self._analyze_bytes_content(content)
269        result.metadata.update({
270            "source": source,
271            "file_name": filename,
272        })
273        return result
274
275    def analyze_url(self, url: str) -> LayoutResult:
276        """
277        Analyse a document at a publicly accessible URL.
278
279        The Azure Document Intelligence service fetches the document directly
280        from the URL — the bytes never pass through your application.
281
282        Args:
283            url: Publicly accessible URL pointing to a supported document.
284
285        Returns:
286            :class:`LayoutResult` with markdown content and metadata.
287
288        Raises:
289            ValueError: If ``url`` is empty.
290        """
291        if not url or not url.strip():
292            raise ValueError("url must not be empty")
293
294        self.logger.info("Analysing URL", url=url, model_id=self.model_id)
295
296        poller = self._client.begin_analyze_document(
297            self.model_id,
298            AnalyzeDocumentRequest(url_source=url),
299            output_content_format=DocumentContentFormat.MARKDOWN,
300            features=[DocumentAnalysisFeature.QUERY_FIELDS],
301        )
302        response = poller.result()
303
304        markdown = response.content or ""
305        page_count = len(response.pages) if response.pages else 0
306
307        self.logger.info(
308            "URL analysis complete",
309            url=url,
310            page_count=page_count,
311            markdown_length=len(markdown),
312        )
313
314        return LayoutResult(
315            markdown=markdown,
316            page_count=page_count,
317            metadata={
318                "source": url,
319                "file_name": url.split("/")[-1],
320                "model_id": self.model_id,
321                "analyzed_at": datetime.now(timezone.utc).isoformat(),
322            },
323        )
324
325    # ------------------------------------------------------------------
326    # Internal helpers
327    # ------------------------------------------------------------------
328
329    def _analyze_bytes_content(self, content: bytes) -> LayoutResult:
330        """Send raw bytes to the Document Intelligence service and return LayoutResult."""
331        poller = self._client.begin_analyze_document(
332            self.model_id,
333            AnalyzeDocumentRequest(bytes_source=content),
334            output_content_format=DocumentContentFormat.MARKDOWN,
335            features=[DocumentAnalysisFeature.QUERY_FIELDS],
336        )
337        response = poller.result()
338
339        markdown = response.content or ""
340        page_count = len(response.pages) if response.pages else 0
341
342        self.logger.info(
343            "Analysis complete",
344            model_id=self.model_id,
345            page_count=page_count,
346            markdown_length=len(markdown),
347        )
348
349        return LayoutResult(
350            markdown=markdown,
351            page_count=page_count,
352            metadata={
353                "model_id": self.model_id,
354                "analyzed_at": datetime.now(timezone.utc).isoformat(),
355            },
356        )

Wraps the Azure Document Intelligence prebuilt-layout model.

Converts PDF, DOCX, PPTX, XLSX, and image files into structured markdown that preserves headings, tables, lists, and page structure. The markdown output is designed to be chunked with MarkdownChunker.

Typical usage::

from gmf_forge_ai_data.layout import DocumentIntelligenceLayout
from gmf_forge_ai_data.chunkers import MarkdownChunker

# Managed identity (omit api_key) — required for Multiservices accounts
layout = DocumentIntelligenceLayout(
    endpoint="https://my-resource.cognitiveservices.azure.com",
)

# API key — for standalone Document Intelligence resources
layout = DocumentIntelligenceLayout(
    endpoint="https://my-resource.cognitiveservices.azure.com",
    api_key="your-api-key",
)

# From a local file
result = layout.analyze_file("annual_report.pdf")

# From raw bytes (e.g. downloaded from BlobStorageConnector)
result = layout.analyze_bytes(blob_content, filename="report.pdf")

# From a URL
result = layout.analyze_url("https://example.com/policy.pdf")

# Chunk the markdown directly
chunker = MarkdownChunker(max_chunk_size=1500, min_header_level=2)
chunks  = chunker.chunk(result.markdown, metadata=result.metadata)

Args: endpoint: Azure Document Intelligence service endpoint URL. api_key: Azure Document Intelligence API key. When provided, AzureKeyCredential is used. When omitted (None), DefaultAzureCredential is used instead — required for Cognitive Services Multiservices accounts that do not expose API keys. model_id: Model to use (default: prebuilt-layout). api_version: REST API version (default: 2024-11-30). logger: Optional BasicLogger instance for structured logging.

Raises: ImportError: If azure-ai-documentintelligence is not installed, or if api_key is omitted and azure-identity is not installed. ValueError: If endpoint is empty.

DocumentIntelligenceLayout( endpoint: str, api_key: Optional[str] = None, model_id: str = 'prebuilt-layout', api_version: str = '2024-11-30', logger: Optional[gmf_forge_ai_shared_core.observability.BasicLogger] = None)
147    def __init__(
148        self,
149        endpoint: str,
150        api_key: Optional[str] = None,
151        model_id: str = _DEFAULT_MODEL,
152        api_version: str = _DEFAULT_API_VERSION,
153        logger: Optional[BasicLogger] = None,
154    ) -> None:
155        if not _SDK_AVAILABLE:
156            raise ImportError(
157                "azure-ai-documentintelligence is required: "
158                "pip install azure-ai-documentintelligence"
159            )
160        if not endpoint or not endpoint.strip():
161            raise ValueError("endpoint must not be empty")
162
163        self.endpoint = endpoint.rstrip("/")
164        self.model_id = model_id
165        self.api_version = api_version
166        self.logger = logger or _logger
167
168        if api_key:
169            # Standalone Document Intelligence resource with key-based auth enabled
170            credential = AzureKeyCredential(api_key)
171            auth_method = "api_key"
172        else:
173            # Cognitive Services Multiservices account — no API key exposed.
174            # DefaultAzureCredential resolves the auth chain automatically:
175            #   In Azure (AKS, VM, App Service): managed identity / workload identity
176            #   Locally: az login  or  VS Code Azure account extension
177            if not _IDENTITY_AVAILABLE:
178                raise ImportError(
179                    "azure-identity is required when api_key is not provided: "
180                    "pip install azure-identity"
181                )
182            credential = DefaultAzureCredential()
183            auth_method = "managed_identity"
184
185        self._client = DocumentIntelligenceClient(
186            endpoint=self.endpoint,
187            credential=credential,
188            api_version=self.api_version,
189        )
190
191        self.logger.info(
192            "DocumentIntelligenceLayout initialised",
193            endpoint=self.endpoint,
194            model_id=self.model_id,
195            api_version=self.api_version,
196            auth=auth_method,
197        )
endpoint
model_id
api_version
logger
def analyze_file(self, file_path: str | pathlib.Path) -> LayoutResult:
203    def analyze_file(self, file_path: str | Path) -> LayoutResult:
204        """
205        Analyse a local file and return its content as markdown.
206
207        Supported file types: PDF, DOCX, PPTX, XLSX, JPEG, PNG, BMP, TIFF, HEIF.
208
209        Args:
210            file_path: Path to the local document file.
211
212        Returns:
213            :class:`LayoutResult` with markdown content and metadata.
214
215        Raises:
216            FileNotFoundError: If the file does not exist.
217            ValueError:        If the file is empty.
218        """
219        path = Path(file_path)
220        if not path.exists():
221            raise FileNotFoundError(f"File not found: {file_path}")
222        if path.stat().st_size == 0:
223            raise ValueError(f"File is empty: {file_path}")
224
225        self.logger.info("Analysing file", file=str(path), model_id=self.model_id)
226
227        with open(path, "rb") as fh:
228            content = fh.read()
229
230        result = self._analyze_bytes_content(content)
231        result.metadata.update({
232            "source": str(path.resolve()),
233            "file_name": path.name,
234        })
235        return result

Analyse a local file and return its content as markdown.

Supported file types: PDF, DOCX, PPTX, XLSX, JPEG, PNG, BMP, TIFF, HEIF.

Args: file_path: Path to the local document file.

Returns: LayoutResult with markdown content and metadata.

Raises: FileNotFoundError: If the file does not exist. ValueError: If the file is empty.

def analyze_bytes( self, content: bytes, filename: str = '') -> LayoutResult:
237    def analyze_bytes(self, content: bytes, filename: str = "") -> LayoutResult:
238        """
239        Analyse raw bytes and return content as markdown.
240
241        Use this when you already have the document bytes in memory — for
242        example, content downloaded via ``BlobStorageConnector`` or
243        ``SharePointConnector``.
244
245        Args:
246            content:  Raw document bytes.
247            filename: Original filename (used for metadata only, e.g. "report.pdf").
248
249        Returns:
250            :class:`LayoutResult` with markdown content and metadata.
251
252        Raises:
253            ValueError: If ``content`` is empty.
254        """
255        if not content:
256            raise ValueError("content must not be empty")
257
258        content_hash = hashlib.sha256(content).hexdigest()[:12]
259        source = f"bytes:{content_hash}"
260
261        self.logger.info(
262            "Analysing bytes",
263            size_bytes=len(content),
264            filename=filename or "(unnamed)",
265            model_id=self.model_id,
266        )
267
268        result = self._analyze_bytes_content(content)
269        result.metadata.update({
270            "source": source,
271            "file_name": filename,
272        })
273        return result

Analyse raw bytes and return content as markdown.

Use this when you already have the document bytes in memory — for example, content downloaded via BlobStorageConnector or SharePointConnector.

Args: content: Raw document bytes. filename: Original filename (used for metadata only, e.g. "report.pdf").

Returns: LayoutResult with markdown content and metadata.

Raises: ValueError: If content is empty.

def analyze_url( self, url: str) -> LayoutResult:
275    def analyze_url(self, url: str) -> LayoutResult:
276        """
277        Analyse a document at a publicly accessible URL.
278
279        The Azure Document Intelligence service fetches the document directly
280        from the URL — the bytes never pass through your application.
281
282        Args:
283            url: Publicly accessible URL pointing to a supported document.
284
285        Returns:
286            :class:`LayoutResult` with markdown content and metadata.
287
288        Raises:
289            ValueError: If ``url`` is empty.
290        """
291        if not url or not url.strip():
292            raise ValueError("url must not be empty")
293
294        self.logger.info("Analysing URL", url=url, model_id=self.model_id)
295
296        poller = self._client.begin_analyze_document(
297            self.model_id,
298            AnalyzeDocumentRequest(url_source=url),
299            output_content_format=DocumentContentFormat.MARKDOWN,
300            features=[DocumentAnalysisFeature.QUERY_FIELDS],
301        )
302        response = poller.result()
303
304        markdown = response.content or ""
305        page_count = len(response.pages) if response.pages else 0
306
307        self.logger.info(
308            "URL analysis complete",
309            url=url,
310            page_count=page_count,
311            markdown_length=len(markdown),
312        )
313
314        return LayoutResult(
315            markdown=markdown,
316            page_count=page_count,
317            metadata={
318                "source": url,
319                "file_name": url.split("/")[-1],
320                "model_id": self.model_id,
321                "analyzed_at": datetime.now(timezone.utc).isoformat(),
322            },
323        )

Analyse a document at a publicly accessible URL.

The Azure Document Intelligence service fetches the document directly from the URL — the bytes never pass through your application.

Args: url: Publicly accessible URL pointing to a supported document.

Returns: LayoutResult with markdown content and metadata.

Raises: ValueError: If url is empty.

@dataclass
class LayoutResult:
@dataclass
class LayoutResult:
    """
    Outcome of one Document Intelligence layout analysis.

    Attributes:
        markdown:    The analysed document rendered as markdown.  Headers,
                     tables, lists, page breaks (``<!-- PageBreak -->``) and
                     figure captions (``<!-- FigureCaption -->``) are kept
                     exactly as Azure Document Intelligence produced them, so
                     the text can be handed straight to ``MarkdownChunker``
                     for header-aware chunking.
        page_count:  How many pages the analysed document has.
        metadata:    Details about the source and the analysis run:
                     ``source``       — file path, URL, or "bytes:<hash>"
                     ``file_name``    — basename of the source file (if known)
                     ``model_id``     — Document Intelligence model used
                     ``analyzed_at`` — ISO-8601 UTC timestamp of the analysis
    """

    markdown: str
    page_count: int
    # default_factory gives every instance its own dict rather than a shared one.
    metadata: Dict[str, Any] = field(default_factory=dict)

Result of a Document Intelligence layout analysis.

Attributes: markdown: Full document content as markdown. Headers, tables, lists, page breaks (<!-- PageBreak -->) and figure captions (<!-- FigureCaption -->) are preserved exactly as produced by Azure Document Intelligence. Pass directly to MarkdownChunker for header-aware chunking. page_count: Number of pages in the analysed document. metadata: Source information and analysis details: source — file path, URL, or "bytes:<hash>"; file_name — basename of the source file (if known); model_id — Document Intelligence model used; analyzed_at — ISO-8601 UTC timestamp of the analysis.

LayoutResult(markdown: str, page_count: int, metadata: Dict[str, Any] = <factory>)
markdown: str
page_count: int
metadata: Dict[str, Any]