Skip to content

DocHandler

dcmspec.doc_handler.DocHandler

Base class for DICOM document handlers.

Handles DICOM documents in various formats (e.g., XHTML, PDF). Subclasses must implement the load_document method to handle reading/parsing input files. The base class provides a generic download method for both text and binary files.

Source code in src/dcmspec/doc_handler.py
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
class DocHandler:
    """Base class for DICOM document handlers.

    Handles DICOM documents in various formats (e.g., XHTML, PDF).
    Subclasses must implement the `load_document` method to handle
    reading/parsing input files. The base class provides a generic
    download method for both text and binary files.
    """

    def __init__(self, config: Optional[Config] = None, logger: Optional[logging.Logger] = None):
        """Initialize the document handler with an optional config and logger.

        Args:
            config (Optional[Config]): Config instance to use. If None, a default Config is created.
            logger (Optional[logging.Logger]): Logger instance to use. If None, a logger named
                after the concrete class is obtained from `logging.getLogger`.

        Raises:
            TypeError: If `logger` is neither None nor a `logging.Logger`, or if `config`
                is neither None nor a `Config`.

        """
        if logger is not None and not isinstance(logger, logging.Logger):
            raise TypeError("logger must be an instance of logging.Logger or None")
        self.logger = logger or logging.getLogger(self.__class__.__name__)

        # Add a StreamHandler and set level if there are no handlers
        if not self.logger.handlers:
            self.logger.setLevel(logging.INFO)
            console_handler = logging.StreamHandler()
            console_handler.setLevel(logging.INFO)
            self.logger.addHandler(console_handler)

        if config is not None and not isinstance(config, Config):
            raise TypeError("config must be an instance of Config or None")
        self.config = config or Config()

    def download(self, url: str, file_path: str, binary: bool = False) -> str:
        """Download a file from a URL and save it to the specified path.

        Downloads a file from the given URL and saves it to the specified file path.
        By default, saves as text (UTF-8); if binary is True, saves as binary (for PDFs, images, etc).
        In text mode the content is passed through the `clean_text` hook before it is written.
        Subclasses may override this method or the `clean_text` hook for format-specific processing.

        The parent directory of `file_path` is created if needed. A bare file name
        (no directory component) is written relative to the current working directory.

        Args:
            url (str): The URL to download the file from.
            file_path (str): The path to save the downloaded file.
            binary (bool): If True, save as binary. If False, save as UTF-8 text.

        Returns:
            str: The file path where the document was saved.

        Raises:
            RuntimeError: If the directory cannot be created, or the download or save fails.

        """
        import requests
        self.logger.info(f"Downloading document from {url} to {file_path}")

        # os.path.dirname() returns "" for a bare file name and os.makedirs("")
        # raises FileNotFoundError, so only create a directory when there is one.
        target_dir = os.path.dirname(file_path)
        if target_dir:
            try:
                os.makedirs(target_dir, exist_ok=True)
            except OSError as e:
                self.logger.error(f"Failed to create directory for {file_path}: {e}")
                raise RuntimeError(f"Failed to create directory for {file_path}: {e}") from e
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            if binary:
                with open(file_path, "wb") as f:
                    f.write(response.content)
            else:
                content = response.text
                content = self.clean_text(content)
                with open(file_path, "w", encoding="utf-8") as f:
                    f.write(content)
            self.logger.info(f"Document downloaded to {file_path}")
            return file_path
        except requests.exceptions.RequestException as e:
            self.logger.error(f"Failed to download {url}: {e}")
            raise RuntimeError(f"Failed to download {url}: {e}") from e
        except OSError as e:
            self.logger.error(f"Failed to save file {file_path}: {e}")
            raise RuntimeError(f"Failed to save file {file_path}: {e}") from e

    def clean_text(self, text: str) -> str:
        """Clean text content before saving.

        Subclasses can override this to perform format-specific cleaning (e.g., remove ZWSP/NBSP for XHTML).
        By default, returns the text unchanged.

        Args:
            text (str): The text content to clean.

        Returns:
            str: The cleaned text.

        """
        return text

    def load_document(
        self,
        cache_file_name: str,
        url: Optional[str] = None,
        force_download: bool = False,
        *args: Any,
        **kwargs: Any
    ) -> Any:
        """Implement this method to read and parse the document file, returning a parsed object.

        Subclasses should implement this method to load and parse a document file
        (e.g., XHTML, PDF, CSV) and return a format-specific parsed object.
        The exact type of the returned object depends on the subclass
        (e.g., BeautifulSoup for XHTML, pdfplumber.PDF for PDF).

        Args:
            cache_file_name (str): Path or name of the local cached file.
            url (Optional[str]): URL to download the file from if not cached or if force_download is True.
            force_download (bool, optional): If True, download the file even if it exists locally.
            *args: Additional positional arguments for format-specific loading.
            **kwargs: Additional keyword arguments for format-specific loading.

        Returns:
            Any: The parsed document object (type depends on subclass).

        Raises:
            NotImplementedError: Always, in this base class.

        """
        raise NotImplementedError("Subclasses must implement load_document()")

__init__(config=None, logger=None)

Initialize the document handler with an optional config and logger.

PARAMETER DESCRIPTION
config

Config instance to use. If None, a default Config is created.

TYPE: Optional[Config] DEFAULT: None

logger

Logger instance to use. If None, a default logger is created.

TYPE: Optional[Logger] DEFAULT: None

Source code in src/dcmspec/doc_handler.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
def __init__(self, config: Optional[Config] = None, logger: Optional[logging.Logger] = None):
    """Set up the handler's logger and configuration.

    Args:
        config (Optional[Config]): Configuration to use. A default ``Config()`` is built when omitted.
        logger (Optional[logging.Logger]): Logger to use. When omitted, a logger named after the
            concrete class is obtained from ``logging.getLogger``.

    Raises:
        TypeError: If ``logger`` or ``config`` is supplied but is not of the expected type.

    """
    # Anything that is neither None nor a real Logger is rejected up front.
    if not (logger is None or isinstance(logger, logging.Logger)):
        raise TypeError("logger must be an instance of logging.Logger or None")
    if logger:
        self.logger = logger
    else:
        self.logger = logging.getLogger(self.__class__.__name__)

    # A logger with no handlers of its own gets an INFO-level StreamHandler.
    if not self.logger.handlers:
        stream_handler = logging.StreamHandler()
        stream_handler.setLevel(logging.INFO)
        self.logger.setLevel(logging.INFO)
        self.logger.addHandler(stream_handler)

    if not (config is None or isinstance(config, Config)):
        raise TypeError("config must be an instance of Config or None")
    if config:
        self.config = config
    else:
        self.config = Config()

clean_text(text)

Clean text content before saving.

Subclasses can override this to perform format-specific cleaning (e.g., remove ZWSP/NBSP for XHTML). By default, returns the text unchanged.

PARAMETER DESCRIPTION
text

The text content to clean.

TYPE: str

RETURNS DESCRIPTION
str

The cleaned text.

TYPE: str

Source code in src/dcmspec/doc_handler.py
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
def clean_text(self, text: str) -> str:
    """Hook applied to downloaded text before it is written to disk.

    The base implementation is the identity: it hands back exactly what it
    was given. Override it in a subclass to strip or normalise characters
    for a particular format (e.g., ZWSP/NBSP removal for XHTML).

    Args:
        text (str): Text content to process.

    Returns:
        str: The processed text; here, ``text`` itself.

    """
    # No format-specific cleaning at this level.
    return text

download(url, file_path, binary=False)

Download a file from a URL and save it to the specified path.

Downloads a file from the given URL and saves it to the specified file path. By default, saves as text (UTF-8); if binary is True, saves as binary (for PDFs, images, etc). Subclasses may override this method or the clean_text hook for format-specific processing.

PARAMETER DESCRIPTION
url

The URL to download the file from.

TYPE: str

file_path

The path to save the downloaded file.

TYPE: str

binary

If True, save as binary. If False, save as UTF-8 text.

TYPE: bool DEFAULT: False

RETURNS DESCRIPTION
str

The file path where the document was saved.

TYPE: str

RAISES DESCRIPTION
RuntimeError

If the download or save fails.

Source code in src/dcmspec/doc_handler.py
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
def download(self, url: str, file_path: str, binary: bool = False) -> str:
    """Download a file from a URL and save it to the specified path.

    Downloads a file from the given URL and saves it to the specified file path.
    By default, saves as text (UTF-8); if binary is True, saves as binary (for PDFs, images, etc).
    In text mode the content is passed through the `clean_text` hook before it is written.
    Subclasses may override this method or the `clean_text` hook for format-specific processing.

    The parent directory of `file_path` is created if needed. A bare file name
    (no directory component) is written relative to the current working directory.

    Args:
        url (str): The URL to download the file from.
        file_path (str): The path to save the downloaded file.
        binary (bool): If True, save as binary. If False, save as UTF-8 text.

    Returns:
        str: The file path where the document was saved.

    Raises:
        RuntimeError: If the directory cannot be created, or the download or save fails.

    """
    import requests
    self.logger.info(f"Downloading document from {url} to {file_path}")

    # os.path.dirname() returns "" for a bare file name and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    target_dir = os.path.dirname(file_path)
    if target_dir:
        try:
            os.makedirs(target_dir, exist_ok=True)
        except OSError as e:
            self.logger.error(f"Failed to create directory for {file_path}: {e}")
            raise RuntimeError(f"Failed to create directory for {file_path}: {e}") from e
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        if binary:
            with open(file_path, "wb") as f:
                f.write(response.content)
        else:
            content = response.text
            content = self.clean_text(content)
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(content)
        self.logger.info(f"Document downloaded to {file_path}")
        return file_path
    except requests.exceptions.RequestException as e:
        self.logger.error(f"Failed to download {url}: {e}")
        raise RuntimeError(f"Failed to download {url}: {e}") from e
    except OSError as e:
        self.logger.error(f"Failed to save file {file_path}: {e}")
        raise RuntimeError(f"Failed to save file {file_path}: {e}") from e

load_document(cache_file_name, url=None, force_download=False, *args, **kwargs)

Implement this method to read and parse the document file, returning a parsed object.

Subclasses should implement this method to load and parse a document file (e.g., XHTML, PDF, CSV) and return a format-specific parsed object. The exact type of the returned object depends on the subclass (e.g., BeautifulSoup for XHTML, pdfplumber.PDF for PDF).

PARAMETER DESCRIPTION
cache_file_name

Path or name of the local cached file.

TYPE: str

url

URL to download the file from if not cached or if force_download is True.

TYPE: Optional[str] DEFAULT: None

force_download

If True, download the file even if it exists locally.

TYPE: bool DEFAULT: False

*args

Additional positional arguments for format-specific loading.

TYPE: Any DEFAULT: ()

**kwargs

Additional keyword arguments for format-specific loading.

TYPE: Any DEFAULT: {}

RETURNS DESCRIPTION
Any

The parsed document object (type depends on subclass).

TYPE: Any

Source code in src/dcmspec/doc_handler.py
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
def load_document(
    self,
    cache_file_name: str,
    url: Optional[str] = None,
    force_download: bool = False,
    *args: Any,
    **kwargs: Any
) -> Any:
    """Read and parse a document file; must be provided by subclasses.

    This base version is a placeholder that always raises. A concrete
    handler is expected to load the file (e.g., XHTML, PDF, CSV) and return
    a parsed object whose type is format-specific (e.g., BeautifulSoup for
    XHTML, pdfplumber.PDF for PDF).

    Args:
        cache_file_name (str): Path or name of the local cached file.
        url (Optional[str]): Where to download the file from when it is not cached
            or when force_download is True.
        force_download (bool): If True, download even if a local copy exists.
        *args: Extra positional arguments for format-specific loading.
        **kwargs: Extra keyword arguments for format-specific loading.

    Returns:
        Any: The parsed document object (type depends on subclass).

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    message = "Subclasses must implement load_document()"
    raise NotImplementedError(message)