DocHandler

`dcmspec.doc_handler.DocHandler`

Base class for DICOM document handlers.

Handles DICOM documents in various formats (e.g., XHTML, PDF). Subclasses must implement the load_document method to handle reading/parsing input files. The base class provides a generic download method for both text and binary files.

Source code in src/dcmspec/doc_handler.py

class DocHandler:
    """Base class for DICOM document handlers.

    Handles DICOM documents in various formats (e.g., XHTML, PDF).
    Subclasses must implement the `load_document` method to handle
    reading/parsing input files. The base class provides a generic
    download method for both text and binary files.

    Attributes:
        logger (logging.Logger): Logger used for progress and error messages.
        config (Config): Configuration supplied by the caller, or a default `Config()`.

    """

    def __init__(self, config: Optional[Config] = None, logger: Optional[logging.Logger] = None):
        """Initialize the document handler with an optional config and logger.

        Args:
            config (Optional[Config]): Config instance to use. If None, a default Config is created.
            logger (Optional[logging.Logger]): Logger instance to use. If None, a logger named
                after the concrete class is obtained from `logging.getLogger`.

        Raises:
            TypeError: If `logger` is not a `logging.Logger` or None, or if `config` is not
                a `Config` or None.

        """
        if logger is not None and not isinstance(logger, logging.Logger):
            raise TypeError("logger must be an instance of logging.Logger or None")
        self.logger = logger or logging.getLogger(self.__class__.__name__)

        # Only configure the logger when nothing is attached to it yet, so a
        # logger that already has handlers is left untouched.
        if not self.logger.handlers:
            self.logger.setLevel(logging.INFO)
            console_handler = logging.StreamHandler()
            console_handler.setLevel(logging.INFO)
            self.logger.addHandler(console_handler)

        if config is not None and not isinstance(config, Config):
            raise TypeError("config must be an instance of Config or None")
        self.config = config or Config()

    def download(self, url: str, file_path: str, binary: bool = False) -> str:
        """Download a file from a URL and save it to the specified path.

        Downloads a file from the given URL and saves it to the specified file path.
        By default, saves as text (UTF-8); if binary is True, saves as binary (for PDFs, images, etc).
        Subclasses may override this method or the `clean_text` hook for format-specific processing.

        The parent directory of `file_path` is created when it does not exist. A bare
        file name (no directory component) is saved relative to the current working
        directory without attempting to create any directory.

        Args:
            url (str): The URL to download the file from.
            file_path (str): The path to save the downloaded file.
            binary (bool): If True, save as binary. If False, save as UTF-8 text.

        Returns:
            str: The file path where the document was saved.

        Raises:
            RuntimeError: If the directory creation, download or save fails.

        """
        import requests
        self.logger.info(f"Downloading document from {url} to {file_path}")

        # os.path.dirname() returns "" for a bare file name and os.makedirs("")
        # raises FileNotFoundError, so only create a directory when there is one.
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            try:
                os.makedirs(parent_dir, exist_ok=True)
            except OSError as e:
                self.logger.error(f"Failed to create directory for {file_path}: {e}")
                raise RuntimeError(f"Failed to create directory for {file_path}: {e}") from e

        try:
            response = requests.get(url, timeout=30)
            # Turn HTTP error statuses into exceptions handled below.
            response.raise_for_status()
            if binary:
                with open(file_path, "wb") as f:
                    f.write(response.content)
            else:
                content = response.text
                # Hook for subclasses to normalize text before it is written.
                content = self.clean_text(content)
                with open(file_path, "w", encoding="utf-8") as f:
                    f.write(content)
            self.logger.info(f"Document downloaded to {file_path}")
            return file_path
        except requests.exceptions.RequestException as e:
            self.logger.error(f"Failed to download {url}: {e}")
            raise RuntimeError(f"Failed to download {url}: {e}") from e
        except OSError as e:
            self.logger.error(f"Failed to save file {file_path}: {e}")
            raise RuntimeError(f"Failed to save file {file_path}: {e}") from e

    def clean_text(self, text: str) -> str:
        """Clean text content before saving.

        Subclasses can override this to perform format-specific cleaning (e.g., remove ZWSP/NBSP for XHTML).
        By default, returns the text unchanged.

        Args:
            text (str): The text content to clean.

        Returns:
            str: The cleaned text.

        """
        return text

    def load_document(
        self,
        cache_file_name: str,
        url: Optional[str] = None,
        force_download: bool = False,
        *args: Any,
        **kwargs: Any
    ) -> Any:
        """Implement this method to read and parse the document file, returning a parsed object.

        Subclasses should implement this method to load and parse a document file
        (e.g., XHTML, PDF, CSV) and return a format-specific parsed object.
        The exact type of the returned object depends on the subclass
        (e.g., BeautifulSoup for XHTML, pdfplumber.PDF for PDF).

        Args:
            cache_file_name (str): Path or name of the local cached file.
            url (str, optional): URL to download the file from if not cached or if force_download is True.
            force_download (bool, optional): If True, download the file even if it exists locally.
            *args: Additional positional arguments for format-specific loading.
            **kwargs: Additional keyword arguments for format-specific loading.

        Returns:
            Any: The parsed document object (type depends on subclass).

        Raises:
            NotImplementedError: Always, in this base class.

        """
        raise NotImplementedError("Subclasses must implement load_document()")

`__init__(config=None, logger=None)`

Initialize the document handler with an optional config and an optional logger.

PARAMETER	DESCRIPTION
`config`	Config instance to use. If None, a default Config is created. TYPE: `Optional[Config]` DEFAULT: `None`
`logger`	Logger instance to use. If None, a default logger is created. TYPE: `Optional[Logger]` DEFAULT: `None`

Source code in src/dcmspec/doc_handler.py

def __init__(self, config: Optional[Config] = None, logger: Optional[logging.Logger] = None):
    """Set up the handler's logger and configuration.

    Args:
        config (Optional[Config]): Configuration to attach. A default Config is built when omitted.
        logger (Optional[logging.Logger]): Logger to attach. A logger named after the class is
            looked up when omitted.

    Raises:
        TypeError: If either argument is given with the wrong type.

    """
    logger_ok = logger is None or isinstance(logger, logging.Logger)
    if not logger_ok:
        raise TypeError("logger must be an instance of logging.Logger or None")
    self.logger = logger if logger else logging.getLogger(self.__class__.__name__)

    # A logger without any handler gets an INFO-level console handler.
    if not self.logger.handlers:
        stream_handler = logging.StreamHandler()
        stream_handler.setLevel(logging.INFO)
        self.logger.setLevel(logging.INFO)
        self.logger.addHandler(stream_handler)

    config_ok = config is None or isinstance(config, Config)
    if not config_ok:
        raise TypeError("config must be an instance of Config or None")
    self.config = config if config else Config()

`clean_text(text)`

Clean text content before saving.

Subclasses can override this to perform format-specific cleaning (e.g., remove ZWSP/NBSP for XHTML). By default, returns the text unchanged.

PARAMETER	DESCRIPTION
`text`	The text content to clean. TYPE: `str`

RETURNS	DESCRIPTION
`str`	The cleaned text. TYPE: `str`

Source code in src/dcmspec/doc_handler.py

def clean_text(self, text: str) -> str:
    """Return the text that should be written to disk.

    This is an extension point: a subclass may override it to strip or
    normalize characters for its format (e.g., ZWSP/NBSP in XHTML).
    The base implementation is the identity function.

    Args:
        text (str): Downloaded text content.

    Returns:
        str: The same text, unmodified.

    """
    cleaned = text
    return cleaned

`download(url, file_path, binary=False)`

Download a file from a URL and save it to the specified path.

Downloads a file from the given URL and saves it to the specified file path. By default, saves as text (UTF-8); if binary is True, saves as binary (for PDFs, images, etc). Subclasses may override this method or the clean_text hook for format-specific processing.

PARAMETER	DESCRIPTION
`url`	The URL to download the file from. TYPE: `str`
`file_path`	The path to save the downloaded file. TYPE: `str`
`binary`	If True, save as binary. If False, save as UTF-8 text. TYPE: `bool` DEFAULT: `False`

RETURNS	DESCRIPTION
`str`	The file path where the document was saved. TYPE: `str`

RAISES	DESCRIPTION
`RuntimeError`	If the download or save fails.

Source code in src/dcmspec/doc_handler.py

def download(self, url: str, file_path: str, binary: bool = False) -> str:
    """Download a file from a URL and save it to the specified path.

    Downloads a file from the given URL and saves it to the specified file path.
    By default, saves as text (UTF-8); if binary is True, saves as binary (for PDFs, images, etc).
    Subclasses may override this method or the `clean_text` hook for format-specific processing.

    The parent directory of `file_path` is created when it does not exist. A bare
    file name (no directory component) is saved relative to the current working
    directory without attempting to create any directory.

    Args:
        url (str): The URL to download the file from.
        file_path (str): The path to save the downloaded file.
        binary (bool): If True, save as binary. If False, save as UTF-8 text.

    Returns:
        str: The file path where the document was saved.

    Raises:
        RuntimeError: If the directory creation, download or save fails.

    """
    import requests
    self.logger.info(f"Downloading document from {url} to {file_path}")

    # os.path.dirname() returns "" for a bare file name and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        try:
            os.makedirs(parent_dir, exist_ok=True)
        except OSError as e:
            self.logger.error(f"Failed to create directory for {file_path}: {e}")
            raise RuntimeError(f"Failed to create directory for {file_path}: {e}") from e

    try:
        response = requests.get(url, timeout=30)
        # Turn HTTP error statuses into exceptions handled below.
        response.raise_for_status()
        if binary:
            with open(file_path, "wb") as f:
                f.write(response.content)
        else:
            content = response.text
            # Hook for subclasses to normalize text before it is written.
            content = self.clean_text(content)
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(content)
        self.logger.info(f"Document downloaded to {file_path}")
        return file_path
    except requests.exceptions.RequestException as e:
        self.logger.error(f"Failed to download {url}: {e}")
        raise RuntimeError(f"Failed to download {url}: {e}") from e
    except OSError as e:
        self.logger.error(f"Failed to save file {file_path}: {e}")
        raise RuntimeError(f"Failed to save file {file_path}: {e}") from e

`load_document(cache_file_name, url=None, force_download=False, *args, **kwargs)`

Implement this method to read and parse the document file, returning a parsed object.

Subclasses should implement this method to load and parse a document file (e.g., XHTML, PDF, CSV) and return a format-specific parsed object. The exact type of the returned object depends on the subclass (e.g., BeautifulSoup for XHTML, pdfplumber.PDF for PDF).

PARAMETER	DESCRIPTION
`cache_file_name`	Path or name of the local cached file. TYPE: `str`
`url`	URL to download the file from if not cached or if force_download is True. TYPE: `Optional[str]` DEFAULT: `None`
`force_download`	If True, download the file even if it exists locally. TYPE: `bool` DEFAULT: `False`
`*args`	Additional positional arguments for format-specific loading. TYPE: `Any` DEFAULT: `()`
`**kwargs`	Additional keyword arguments for format-specific loading. TYPE: `Any` DEFAULT: `{}`

RETURNS	DESCRIPTION
`Any`	The parsed document object (type depends on subclass). TYPE: `Any`

Source code in src/dcmspec/doc_handler.py

def load_document(
    self,
    cache_file_name: str,
    url: Optional[str] = None,
    force_download: bool = False,
    *args: Any,
    **kwargs: Any
) -> Any:
    """Read and parse a document file; must be provided by each subclass.

    A concrete handler loads the document (e.g., XHTML, PDF, CSV) and
    returns whatever parsed representation suits its format
    (e.g., BeautifulSoup for XHTML, pdfplumber.PDF for PDF).
    This base implementation does no work and always raises.

    Args:
        cache_file_name (str): Path or name of the local cached file.
        url (str, optional): Where to fetch the file from when it is not cached or
            when force_download is True.
        force_download (bool, optional): If True, fetch the file even when a local copy exists.
        *args: Extra positional arguments for format-specific loading.
        **kwargs: Extra keyword arguments for format-specific loading.

    Returns:
        Any: The parsed document object; its type is decided by the subclass.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    message = "Subclasses must implement load_document()"
    raise NotImplementedError(message)

DocHandler

dcmspec.doc_handler.DocHandler

__init__(config=None, logger=None)

clean_text(text)

download(url, file_path, binary=False)

load_document(cache_file_name, url=None, force_download=False, *args, **kwargs)

`dcmspec.doc_handler.DocHandler`

`__init__(config=None, logger=None)`

`clean_text(text)`

`download(url, file_path, binary=False)`

`load_document(cache_file_name, url=None, force_download=False, *args, **kwargs)`