
XHTMLDocHandler

dcmspec.xhtml_doc_handler.XHTMLDocHandler

Bases: DocHandler

Handler class for DICOM specifications documents in XHTML format.

Provides methods to download, cache, and parse XHTML documents, returning a BeautifulSoup DOM object. Inherits configuration and logging from DocHandler.

Note: Progress reporting via progress_observer covers both downloading and caching (writing to disk). Parsing and cache loading are typically fast and do not emit progress updates.
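A minimal usage sketch (the cache file name and URL below are illustrative placeholders, not values prescribed by this page):

    from dcmspec.xhtml_doc_handler import XHTMLDocHandler

    handler = XHTMLDocHandler()  # default Config and logger inherited from DocHandler

    # Download (or reuse the cached copy of) an XHTML document and parse it.
    dom = handler.load_document(
        cache_file_name="part03.xhtml",                 # placeholder file name
        url="https://example.org/dicom/part03.xhtml",   # placeholder URL
    )
    print(dom.find("title"))  # the returned object is a BeautifulSoup DOM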

Source code in src/dcmspec/xhtml_doc_handler.py
class XHTMLDocHandler(DocHandler):
    """Handler class for DICOM specifications documents in XHTML format.

    Provides methods to download, cache, and parse XHTML documents, returning a BeautifulSoup DOM object.
    Inherits configuration and logging from DocHandler.

    Note:
    Progress reporting via progress_observer covers both downloading and caching (writing to disk).
    Parsing and cache loading are typically fast and do not emit progress updates.

    """

    def __init__(self, config: Optional[Config] = None, logger: Optional[logging.Logger] = None):
        """Initialize the XHTML document handler and set cache_file_name to None."""
        super().__init__(config=config, logger=logger)
        self.cache_file_name = None

    def load_document(
            self, cache_file_name: str,
            url: Optional[str] = None,
            force_download: bool = False,
            progress_observer: 'Optional[ProgressObserver]' = None,
            # BEGIN LEGACY SUPPORT: Remove for int progress callback deprecation
            progress_callback: 'Optional[Callable[[int], None]]' = None,
            # END LEGACY SUPPORT
    ) -> BeautifulSoup:
        # sourcery skip: merge-else-if-into-elif, reintroduce-else, swap-if-else-branches
        """Open and parse an XHTML file, downloading it if needed.

        Args:
            cache_file_name (str): Path to the local cached XHTML file.
            url (str, optional): URL to download the file from if not cached or if force_download is True.
            force_download (bool): If True, do not use cache and download the file from the URL.
            progress_observer (Optional[ProgressObserver]): Optional observer to report download progress.
            progress_callback (Optional[Callable[[int], None]]): [LEGACY, Deprecated] Optional callback to
                report progress as an integer percent (0-100, or -1 if indeterminate). Use progress_observer
                instead. Will be removed in a future release.

        Returns:
            BeautifulSoup: Parsed DOM.

        """
        # BEGIN LEGACY SUPPORT: Remove for int progress callback deprecation
        progress_observer = handle_legacy_callback(progress_observer, progress_callback)
        # END LEGACY SUPPORT

        # Set cache_file_name as an attribute for downstream use (e.g., in SpecFactory)
        self.cache_file_name = cache_file_name

        cache_file_path = os.path.join(self.config.get_param("cache_dir"), "standard", cache_file_name)
        need_download = force_download or (not os.path.exists(cache_file_path))
        if need_download:
            if not url:
                raise ValueError("URL must be provided to download the file.")
            cache_file_path = self.download(url, cache_file_name, progress_observer=progress_observer)
        else:
            # Also report progress when XHTML file was loaded from cache (keeping DOWNLOADING status for consistency)
            if progress_observer:
                progress_observer(Progress(100, status=ProgressStatus.DOWNLOADING))     

        # No need to report progress for parsing as, even for the largest DICOM standard XHTML file of 35 MB,
        # the parsing is fast and not a bottleneck. If future files or operations make parsing slow,
        # consider extending progress reporting here.
        return self.parse_dom(cache_file_path)

    def download(
        self,
        url: str,
        cache_file_name: str,
        progress_observer: 'Optional[ProgressObserver]' = None,
        # BEGIN LEGACY SUPPORT: Remove for int progress callback deprecation
        progress_callback: 'Optional[Callable[[int], None]]' = None
        # END LEGACY SUPPORT
    ) -> str:
        """Download and cache an XHTML file from a URL.

        Uses the base class download method, saving as UTF-8 text and cleaning ZWSP/NBSP.

        Args:
            url: The URL of the XHTML document to download.
            cache_file_name: The filename of the cached document.
            progress_observer: Optional observer to report download progress.
            progress_callback (Optional[Callable[[int], None]]): [LEGACY, Deprecated] Optional callback to
                report progress as an integer percent (0-100, or -1 if indeterminate). Use progress_observer
                instead. Will be removed in a future release.

        Returns:
            The file path where the document was saved.

        Raises:
            RuntimeError: If the download or save fails.

        """
        # BEGIN LEGACY SUPPORT: Remove for int progress callback deprecation
        progress_observer = handle_legacy_callback(progress_observer, progress_callback)
        # END LEGACY SUPPORT
        file_path = os.path.join(self.config.get_param("cache_dir"), "standard", cache_file_name)
        return super().download(url, file_path, binary=False, progress_observer=progress_observer)

    def clean_text(self, text: str) -> str:
        """Clean text content before saving.

        Removes zero-width space (ZWSP) and non-breaking space (NBSP) characters.

        Args:
            text (str): The text content to clean.

        Returns:
            str: The cleaned text.

        """
        cleaned_content = re.sub(r"\u200b", "", text)
        cleaned_content = re.sub(r"\u00a0", " ", cleaned_content)
        return cleaned_content

    def parse_dom(self, file_path: str) -> BeautifulSoup:
        """Parse a cached XHTML file into a BeautifulSoup DOM object.

        Args:
            file_path (str): Path to the cached XHTML file to parse.

        Returns:
            BeautifulSoup: The parsed DOM object.

        Raises:
            RuntimeError: If the file cannot be read or parsed.

        """
        self.logger.info(f"Reading XHTML DOM from {file_path}")
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
            # dom = BeautifulSoup(content, "html.parser")  # use python HTML parser. Fine for XHTML. Unreliable for XML.
            # dom = BeautifulSoup(content, "lxml")  # use lxml package parser. Default to HTML and generates a warning.
            dom = BeautifulSoup(content, features="xml")  # use lxml package parser. Force using XML. Safest choice.
            self.logger.info("XHTML DOM read successfully")

            return dom
        except OSError as e:
            self.logger.error(f"Failed to read file {file_path}: {e}")
            raise RuntimeError(f"Failed to read file {file_path}: {e}") from e
        except Exception as e:
            self.logger.error(f"Failed to parse XHTML file {file_path}: {e}")
            raise RuntimeError(f"Failed to parse XHTML file {file_path}: {e}") from e

    def _patch_table(self, dom: BeautifulSoup, table_id: str) -> None:
        """Patch an XHTML table to fix potential errors.

        This method does nothing and may be overridden in derived classes if patching is needed.

        Args:
            dom (BeautifulSoup): The parsed XHTML DOM object.
            table_id (str): The ID of the table to patch.

        Returns:
            None

        """
        pass

__init__(config=None, logger=None)

Initialize the XHTML document handler and set cache_file_name to None.

Source code in src/dcmspec/xhtml_doc_handler.py
def __init__(self, config: Optional[Config] = None, logger: Optional[logging.Logger] = None):
    """Initialize the XHTML document handler and set cache_file_name to None."""
    super().__init__(config=config, logger=logger)
    self.cache_file_name = None
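
Since the constructor simply forwards config and logger to DocHandler, a caller can inject its own logger. A hedged sketch using only the standard library logging module:

    import logging

    from dcmspec.xhtml_doc_handler import XHTMLDocHandler

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("dcmspec.example")  # hypothetical logger name

    handler = XHTMLDocHandler(logger=logger)
    print(handler.cache_file_name)  # None until load_document() sets it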

clean_text(text)

Clean text content before saving.

Removes zero-width space (ZWSP) and non-breaking space (NBSP) characters.

PARAMETERS
    text (str): The text content to clean.

RETURNS
    str: The cleaned text.

Source code in src/dcmspec/xhtml_doc_handler.py
def clean_text(self, text: str) -> str:
    """Clean text content before saving.

    Removes zero-width space (ZWSP) and non-breaking space (NBSP) characters.

    Args:
        text (str): The text content to clean.

    Returns:
        str: The cleaned text.

    """
    cleaned_content = re.sub(r"\u200b", "", text)
    cleaned_content = re.sub(r"\u00a0", " ", cleaned_content)
    return cleaned_content
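
A quick illustration of the substitutions (zero-width space removed, non-breaking space replaced by a regular space):

    from dcmspec.xhtml_doc_handler import XHTMLDocHandler

    handler = XHTMLDocHandler()
    raw = "SOP\u200bClass\u00a0UID"
    print(handler.clean_text(raw))  # -> "SOPClass UID"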

download(url, cache_file_name, progress_observer=None, progress_callback=None)

Download and cache an XHTML file from a URL.

Uses the base class download method, saving as UTF-8 text and cleaning ZWSP/NBSP.

PARAMETERS
    url (str): The URL of the XHTML document to download.
    cache_file_name (str): The filename of the cached document.
    progress_observer (Optional[ProgressObserver], default None): Optional observer to report download progress.
    progress_callback (Optional[Callable[[int], None]], default None): [LEGACY, Deprecated] Optional callback to
        report progress as an integer percent (0-100, or -1 if indeterminate). Use progress_observer instead.
        Will be removed in a future release.

RETURNS
    str: The file path where the document was saved.

RAISES
    RuntimeError: If the download or save fails.

Source code in src/dcmspec/xhtml_doc_handler.py
def download(
    self,
    url: str,
    cache_file_name: str,
    progress_observer: 'Optional[ProgressObserver]' = None,
    # BEGIN LEGACY SUPPORT: Remove for int progress callback deprecation
    progress_callback: 'Optional[Callable[[int], None]]' = None
    # END LEGACY SUPPORT
) -> str:
    """Download and cache an XHTML file from a URL.

    Uses the base class download method, saving as UTF-8 text and cleaning ZWSP/NBSP.

    Args:
        url: The URL of the XHTML document to download.
        cache_file_name: The filename of the cached document.
        progress_observer: Optional observer to report download progress.
        progress_callback (Optional[Callable[[int], None]]): [LEGACY, Deprecated] Optional callback to
            report progress as an integer percent (0-100, or -1 if indeterminate). Use progress_observer
            instead. Will be removed in a future release.

    Returns:
        The file path where the document was saved.

    Raises:
        RuntimeError: If the download or save fails.

    """
    # BEGIN LEGACY SUPPORT: Remove for int progress callback deprecation
    progress_observer = handle_legacy_callback(progress_observer, progress_callback)
    # END LEGACY SUPPORT
    file_path = os.path.join(self.config.get_param("cache_dir"), "standard", cache_file_name)
    return super().download(url, file_path, binary=False, progress_observer=progress_observer)
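
A hedged sketch of calling download directly. The URL and file name are placeholders; the destination directory comes from the handler's Config ("cache_dir" parameter), so the returned path is <cache_dir>/standard/<cache_file_name>:

    from dcmspec.xhtml_doc_handler import XHTMLDocHandler

    handler = XHTMLDocHandler()
    path = handler.download(
        url="https://example.org/dicom/part04.xhtml",  # placeholder URL
        cache_file_name="part04.xhtml",                # placeholder file name
    )
    print(path)  # e.g. <cache_dir>/standard/part04.xhtml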

load_document(cache_file_name, url=None, force_download=False, progress_observer=None, progress_callback=None)

Open and parse an XHTML file, downloading it if needed.

PARAMETERS
    cache_file_name (str): Path to the local cached XHTML file.
    url (str, default None): URL to download the file from if not cached or if force_download is True.
    force_download (bool, default False): If True, do not use cache and download the file from the URL.
    progress_observer (Optional[ProgressObserver], default None): Optional observer to report download progress.
    progress_callback (Optional[Callable[[int], None]], default None): [LEGACY, Deprecated] Optional callback to
        report progress as an integer percent (0-100, or -1 if indeterminate). Use progress_observer instead.
        Will be removed in a future release.

RETURNS
    BeautifulSoup: Parsed DOM.

Source code in src/dcmspec/xhtml_doc_handler.py
def load_document(
        self, cache_file_name: str,
        url: Optional[str] = None,
        force_download: bool = False,
        progress_observer: 'Optional[ProgressObserver]' = None,
        # BEGIN LEGACY SUPPORT: Remove for int progress callback deprecation
        progress_callback: 'Optional[Callable[[int], None]]' = None,
        # END LEGACY SUPPORT
) -> BeautifulSoup:
    # sourcery skip: merge-else-if-into-elif, reintroduce-else, swap-if-else-branches
    """Open and parse an XHTML file, downloading it if needed.

    Args:
        cache_file_name (str): Path to the local cached XHTML file.
        url (str, optional): URL to download the file from if not cached or if force_download is True.
        force_download (bool): If True, do not use cache and download the file from the URL.
        progress_observer (Optional[ProgressObserver]): Optional observer to report download progress.
        progress_callback (Optional[Callable[[int], None]]): [LEGACY, Deprecated] Optional callback to
            report progress as an integer percent (0-100, or -1 if indeterminate). Use progress_observer
            instead. Will be removed in a future release.

    Returns:
        BeautifulSoup: Parsed DOM.

    """
    # BEGIN LEGACY SUPPORT: Remove for int progress callback deprecation
    progress_observer = handle_legacy_callback(progress_observer, progress_callback)
    # END LEGACY SUPPORT

    # Set cache_file_name as an attribute for downstream use (e.g., in SpecFactory)
    self.cache_file_name = cache_file_name

    cache_file_path = os.path.join(self.config.get_param("cache_dir"), "standard", cache_file_name)
    need_download = force_download or (not os.path.exists(cache_file_path))
    if need_download:
        if not url:
            raise ValueError("URL must be provided to download the file.")
        cache_file_path = self.download(url, cache_file_name, progress_observer=progress_observer)
    else:
        # Also report progress when XHTML file was loaded from cache (keeping DOWNLOADING status for consistency)
        if progress_observer:
            progress_observer(Progress(100, status=ProgressStatus.DOWNLOADING))     

    # No need to report progress for parsing as, even for the largest DICOM standard XHTML file of 35 MB,
    # the parsing is fast and not a bottleneck. If future files or operations make parsing slow,
    # consider extending progress reporting here.
    return self.parse_dom(cache_file_path)
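
A sketch of wiring a progress observer. The source above only ever calls the observer like a function with a Progress event, so a plain callable is used here; whether ProgressObserver is a formal class or just a callable protocol is not stated on this page, and the file name and URL are placeholders:

    from dcmspec.xhtml_doc_handler import XHTMLDocHandler

    def on_progress(event):
        # Receives Progress(value, status=...) events during download and caching.
        print(f"progress event: {event!r}")

    handler = XHTMLDocHandler()
    dom = handler.load_document(
        cache_file_name="part06.xhtml",                 # placeholder file name
        url="https://example.org/dicom/part06.xhtml",   # placeholder URL
        force_download=True,                            # skip the cache and re-download
        progress_observer=on_progress,
    )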

parse_dom(file_path)

Parse a cached XHTML file into a BeautifulSoup DOM object.

PARAMETERS
    file_path (str): Path to the cached XHTML file to parse.

RETURNS
    BeautifulSoup: The parsed DOM object.

RAISES
    RuntimeError: If the file cannot be read or parsed.

Source code in src/dcmspec/xhtml_doc_handler.py
def parse_dom(self, file_path: str) -> BeautifulSoup:
    """Parse a cached XHTML file into a BeautifulSoup DOM object.

    Args:
        file_path (str): Path to the cached XHTML file to parse.

    Returns:
        BeautifulSoup: The parsed DOM object.

    Raises:
        RuntimeError: If the file cannot be read or parsed.

    """
    self.logger.info(f"Reading XHTML DOM from {file_path}")
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
        # dom = BeautifulSoup(content, "html.parser")  # use python HTML parser. Fine for XHTML. Unreliable for XML.
        # dom = BeautifulSoup(content, "lxml")  # use lxml package parser. Default to HTML and generates a warning.
        dom = BeautifulSoup(content, features="xml")  # use lxml package parser. Force using XML. Safest choice.
        self.logger.info("XHTML DOM read successfully")

        return dom
    except OSError as e:
        self.logger.error(f"Failed to read file {file_path}: {e}")
        raise RuntimeError(f"Failed to read file {file_path}: {e}") from e
    except Exception as e:
        self.logger.error(f"Failed to parse XHTML file {file_path}: {e}")
        raise RuntimeError(f"Failed to parse XHTML file {file_path}: {e}") from e