Skip to content

XHTMLDocHandler

dcmspec.xhtml_doc_handler.XHTMLDocHandler

Bases: DocHandler

Handler class for DICOM specification documents in XHTML format.

Provides methods to download, cache, and parse XHTML documents, returning a BeautifulSoup DOM object. Inherits configuration and logging from DocHandler.

Source code in src/dcmspec/xhtml_doc_handler.py
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
class XHTMLDocHandler(DocHandler):
    """Handler class for DICOM specification documents in XHTML format.

    Provides methods to download, cache, and parse XHTML documents, returning a BeautifulSoup DOM object.
    Inherits configuration and logging from DocHandler.
    """

    def __init__(self, config: Optional[Config] = None, logger: Optional[logging.Logger] = None):
        """Initialize the XHTML document handler and set cache_file_name to None.

        Args:
            config (Optional[Config]): Configuration forwarded to the DocHandler initializer.
            logger (Optional[logging.Logger]): Logger forwarded to the DocHandler initializer.

        """
        super().__init__(config=config, logger=logger)
        # No document selected yet; load_document() stores the file name here.
        self.cache_file_name = None

    def load_document(
            self, cache_file_name: str,
            url: Optional[str] = None,
            force_download: bool = False
    ) -> BeautifulSoup:
        """Open and parse an XHTML file, downloading it first if needed.

        The file is looked up at ``<cache_dir>/standard/<cache_file_name>``, where ``cache_dir`` is read
        from the handler configuration.

        Args:
            cache_file_name (str): File name of the cached XHTML file inside the "standard" cache folder.
            url (str, optional): URL to download the file from if not cached or if force_download is True.
            force_download (bool): If True, do not use cache and download the file from the URL.

        Returns:
            BeautifulSoup: Parsed DOM.

        Raises:
            ValueError: If a download is required but no URL was provided.
            RuntimeError: If the cached file cannot be read or parsed (raised by parse_dom).

        """
        # Set cache_file_name as an attribute for downstream use (e.g., in SpecFactory)
        self.cache_file_name = cache_file_name

        standard_dir = os.path.join(self.config.get_param("cache_dir"), "standard")
        local_path = os.path.join(standard_dir, cache_file_name)

        # Fast path: a cached copy exists and the caller did not ask for a refresh.
        if not force_download and os.path.exists(local_path):
            return self.parse_dom(local_path)

        if not url:
            raise ValueError("URL must be provided to download the file.")
        return self.parse_dom(self.download(url, cache_file_name))

    def download(self, url: str, cache_file_name: str) -> str:
        """Download and cache an XHTML file from a URL.

        Delegates to the base class download method in text mode (binary=False), targeting
        ``<cache_dir>/standard/<cache_file_name>``.

        NOTE(review): the base class is presumably what saves the content as UTF-8 text and applies
        clean_text (ZWSP/NBSP cleanup) - confirm in DocHandler.

        Args:
            url: The URL of the XHTML document to download.
            cache_file_name: The filename of the cached document.

        Returns:
            The file path where the document was saved, as returned by the base class.

        Raises:
            RuntimeError: If the download or save fails (presumably raised by the base class).

        """
        standard_dir = os.path.join(self.config.get_param("cache_dir"), "standard")
        return super().download(url, os.path.join(standard_dir, cache_file_name), binary=False)

    def clean_text(self, text: str) -> str:
        """Clean text content before saving.

        Removes zero-width space (ZWSP, U+200B) characters and replaces non-breaking space
        (NBSP, U+00A0) characters with a regular space.

        Args:
            text (str): The text content to clean.

        Returns:
            str: The cleaned text.

        """
        # Plain literal substitutions: str.replace is sufficient, no regex needed.
        return text.replace("\u200b", "").replace("\u00a0", " ")

    def parse_dom(self, file_path: str) -> BeautifulSoup:
        """Parse a cached XHTML file into a BeautifulSoup DOM object.

        Args:
            file_path (str): Path to the cached XHTML file to parse.

        Returns:
            BeautifulSoup: The parsed DOM object.

        Raises:
            RuntimeError: If the file cannot be read or parsed.

        """
        self.logger.info(f"Reading XHTML DOM from {file_path}")
        try:
            with open(file_path, "r", encoding="utf-8") as xhtml_file:
                markup = xhtml_file.read()
            # Parser choice: "html.parser" is fine for XHTML but unreliable for XML, and "lxml" defaults
            # to HTML and generates a warning. features="xml" forces the lxml XML parser (safest choice).
            dom = BeautifulSoup(markup, features="xml")
            self.logger.info("XHTML DOM read successfully")
            return dom
        except OSError as exc:
            message = f"Failed to read file {file_path}: {exc}"
            self.logger.error(message)
            raise RuntimeError(message) from exc
        except Exception as exc:
            # Anything that is not an OS-level I/O error is reported as a parse failure.
            message = f"Failed to parse XHTML file {file_path}: {exc}"
            self.logger.error(message)
            raise RuntimeError(message) from exc

    def _patch_table(self, dom: BeautifulSoup, table_id: str) -> None:
        """Patch an XHTML table to fix potential errors.

        This method does nothing and may be overridden in derived classes if patching is needed.

        Args:
            dom (BeautifulSoup): The parsed XHTML DOM object.
            table_id (str): The ID of the table to patch.

        Returns:
            None

        """

__init__(config=None, logger=None)

Initialize the XHTML document handler and set cache_file_name to None.

Source code in src/dcmspec/xhtml_doc_handler.py
24
25
26
27
def __init__(
        self,
        config: Optional[Config] = None,
        logger: Optional[logging.Logger] = None,
):
    """Create the handler with no cached document selected yet.

    Args:
        config (Optional[Config]): Configuration passed through to the base class initializer.
        logger (Optional[logging.Logger]): Logger passed through to the base class initializer.

    """
    super().__init__(config=config, logger=logger)
    # Starts empty; load_document() records the requested file name here.
    self.cache_file_name = None

clean_text(text)

Clean text content before saving.

Removes zero-width space (ZWSP) characters and replaces non-breaking space (NBSP) characters with regular spaces.

PARAMETER DESCRIPTION
text

The text content to clean.

TYPE: str

RETURNS DESCRIPTION
str

The cleaned text.

TYPE: str

Source code in src/dcmspec/xhtml_doc_handler.py
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
def clean_text(self, text: str) -> str:
    """Clean text content before saving.

    Removes zero-width space (ZWSP, U+200B) characters and replaces non-breaking space
    (NBSP, U+00A0) characters with a regular space.

    Args:
        text (str): The text content to clean.

    Returns:
        str: The cleaned text.

    """
    # Plain literal substitutions: str.replace is sufficient, no regex needed.
    return text.replace("\u200b", "").replace("\u00a0", " ")

download(url, cache_file_name)

Download and cache an XHTML file from a URL.

Uses the base class download method, saving as UTF-8 text and cleaning ZWSP/NBSP.

PARAMETER DESCRIPTION
url

The URL of the XHTML document to download.

TYPE: str

cache_file_name

The filename of the cached document.

TYPE: str

RETURNS DESCRIPTION
str

The file path where the document was saved.

RAISES DESCRIPTION
RuntimeError

If the download or save fails.

Source code in src/dcmspec/xhtml_doc_handler.py
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def download(self, url: str, cache_file_name: str) -> str:
    """Fetch an XHTML document from a URL into the "standard" cache folder.

    Delegates to the base class download method in text mode (binary=False), targeting
    ``<cache_dir>/standard/<cache_file_name>``.

    NOTE(review): the base class is presumably what saves the content as UTF-8 text and applies
    clean_text (ZWSP/NBSP cleanup) - confirm in DocHandler.

    Args:
        url: The URL of the XHTML document to download.
        cache_file_name: The filename of the cached document.

    Returns:
        The file path where the document was saved, as returned by the base class.

    Raises:
        RuntimeError: If the download or save fails (presumably raised by the base class).

    """
    standard_dir = os.path.join(self.config.get_param("cache_dir"), "standard")
    target_path = os.path.join(standard_dir, cache_file_name)
    return super().download(url, target_path, binary=False)

load_document(cache_file_name, url=None, force_download=False)

Open and parse an XHTML file, downloading it if needed.

PARAMETER DESCRIPTION
cache_file_name

File name of the local cached XHTML file, located in the "standard" folder of the configured cache directory.

TYPE: str

url

URL to download the file from if not cached or if force_download is True.

TYPE: str DEFAULT: None

force_download

If True, do not use cache and download the file from the URL.

TYPE: bool DEFAULT: False

RETURNS DESCRIPTION
BeautifulSoup

Parsed DOM.

TYPE: BeautifulSoup

Source code in src/dcmspec/xhtml_doc_handler.py
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
def load_document(
        self, cache_file_name: str,
        url: Optional[str] = None,
        force_download: bool = False
) -> BeautifulSoup:
    """Return the parsed DOM of an XHTML document, fetching it first when necessary.

    The file is looked up at ``<cache_dir>/standard/<cache_file_name>``, where ``cache_dir`` is read
    from the handler configuration.

    Args:
        cache_file_name (str): File name of the cached XHTML file inside the "standard" cache folder.
        url (str, optional): URL to download the file from if not cached or if force_download is True.
        force_download (bool): If True, do not use cache and download the file from the URL.

    Returns:
        BeautifulSoup: Parsed DOM.

    Raises:
        ValueError: If a download is required but no URL was provided.

    """
    # Remember the requested file name for downstream use (e.g., in SpecFactory).
    self.cache_file_name = cache_file_name

    standard_dir = os.path.join(self.config.get_param("cache_dir"), "standard")
    local_path = os.path.join(standard_dir, cache_file_name)

    # Fast path: a cached copy exists and the caller did not ask for a refresh.
    if not force_download and os.path.exists(local_path):
        return self.parse_dom(local_path)

    if not url:
        raise ValueError("URL must be provided to download the file.")
    return self.parse_dom(self.download(url, cache_file_name))

parse_dom(file_path)

Parse a cached XHTML file into a BeautifulSoup DOM object.

PARAMETER DESCRIPTION
file_path

Path to the cached XHTML file to parse.

TYPE: str

RETURNS DESCRIPTION
BeautifulSoup

The parsed DOM object.

TYPE: BeautifulSoup

RAISES DESCRIPTION
RuntimeError

If the file cannot be read or parsed.

Source code in src/dcmspec/xhtml_doc_handler.py
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
def parse_dom(self, file_path: str) -> BeautifulSoup:
    """Read a cached XHTML file and build a BeautifulSoup DOM from it.

    Args:
        file_path (str): Path to the cached XHTML file to parse.

    Returns:
        BeautifulSoup: The parsed DOM object.

    Raises:
        RuntimeError: If the file cannot be read or parsed.

    """
    self.logger.info(f"Reading XHTML DOM from {file_path}")
    try:
        with open(file_path, "r", encoding="utf-8") as xhtml_file:
            markup = xhtml_file.read()
        # Parser choice: "html.parser" is fine for XHTML but unreliable for XML, and "lxml" defaults
        # to HTML and generates a warning. features="xml" forces the lxml XML parser (safest choice).
        parsed = BeautifulSoup(markup, features="xml")
        self.logger.info("XHTML DOM read successfully")
        return parsed
    except OSError as exc:
        message = f"Failed to read file {file_path}: {exc}"
        self.logger.error(message)
        raise RuntimeError(message) from exc
    except Exception as exc:
        # Anything that is not an OS-level I/O error is reported as a parse failure.
        message = f"Failed to parse XHTML file {file_path}: {exc}"
        self.logger.error(message)
        raise RuntimeError(message) from exc