# Source code for starlingrt.sample

"""
Structures to store samples from different mass spectrometers.

Author: Nathan A. Mahynski
"""

import xlrd

import numpy as np

from starlingrt import data

from typing import Any


class MassHunterSample(data._SampleBase):
    """Class to store the MSRep.xls output from MassHunter(TM)."""

    def read(self, filename: str) -> None:
        """
        Read data from MSRep.xls file.

        This assumes a specific formatted output from MassHunter(TM); the
        layout of both the "IntRes" and "LibRes" tabs is verified while
        reading.  On success ``self._filename``, ``self._metadata``,
        ``self._compounds`` (one ``data.Compound`` per IntRes row) and
        ``self._hits`` (compound number -> list of ``data.Hit``) are set.

        Parameters
        ----------
        filename : str
            Pathname of MSRep.xls file.

        Raises
        ------
        AssertionError
            If the column headers of either tab are not as expected, if the
            metadata of the two tabs disagree, if a hit row appears before
            any compound number in the LibRes tab, or if the compounds with
            hits are not exactly 1..N for the N compounds in IntRes.  These
            are raised explicitly (rather than via ``assert``) so that the
            checks still run under ``python -O``.
        """
        self._filename = filename  # type: ignore [misc]

        wb = xlrd.open_workbook(self._filename)
        intres = wb.sheet_by_name("IntRes")
        libres = wb.sheet_by_name("LibRes")

        # Record metadata at the top of the IntRes tab
        self._metadata = [intres.cell(i, 0).value for i in range(4)]

        # Check that the columns are as expected.  The names are compared
        # verbatim against the header row, so the spelling (including
        # "Heigth") must not be "corrected" here.
        intres_columns = [
            "Compound number (#)",
            "RT (min)",
            "Scan number (#)",
            "Area (Ab*s)",
            "Baseline Heigth (Ab)",
            "Absolute Heigth (Ab)",
            "Peak Width 50% (min)",
            "Start Time (min)",
            "End Time (min)",
            "Start Height (Ab)",
            "End Height (Ab)",
            "Peak Type",
        ]
        if intres.row_values(5) != intres_columns:
            raise AssertionError(
                f"Column names in the IntRes tab of {self._filename} are not as expected."
            )

        # Map each data.Compound keyword to its column index once, instead
        # of searching the header list for every cell of every row.
        compound_columns = {
            "number": intres_columns.index("Compound number (#)"),
            "rt": intres_columns.index("RT (min)"),
            "scan_number": intres_columns.index("Scan number (#)"),
            "area": intres_columns.index("Area (Ab*s)"),
            "baseline_height": intres_columns.index("Baseline Heigth (Ab)"),
            "absolute_height": intres_columns.index("Absolute Heigth (Ab)"),
            "peak_width": intres_columns.index("Peak Width 50% (min)"),
        }

        # Read all compounds (data rows start right after the header row)
        self._compounds = []  # type: ignore [misc]
        for row_idx in range(6, intres.nrows):
            row = intres.row_values(row_idx)
            self._compounds.append(
                data.Compound(
                    **{
                        field: row[col]
                        for field, col in compound_columns.items()
                    }
                )
            )

        # Read all the hits for each compound

        # Check metadata at the top of the LibRes tab is the same
        check_meta = [libres.cell(i, 0).value for i in range(4)]
        if check_meta != self._metadata:
            raise AssertionError(
                f"Metadata from the LibRes tab disagrees with IntRes in {self._filename}"
            )

        # Check that the columns are as expected
        libres_columns = [
            "Compound number (#)",
            "RT (min)",
            "Scan number (#)",
            "Area (Ab*s)",
            "Baseline Heigth (Ab)",
            "Absolute Heigth (Ab)",
            "Peak Width 50% (min)",
            "Hit Number",
            "Hit Name",
            "Quality",
            "Mol Weight (amu)",
            "CAS Number",
            "Library",
            "Entry Number Library",
        ]
        if libres.row_values(8) != libres_columns:
            raise AssertionError(
                f"Column names in the LibRes tab of {self._filename} are not as expected."
            )

        compound_col = libres_columns.index("Compound number (#)")
        hit_columns = {
            "number": libres_columns.index("Hit Number"),
            "name": libres_columns.index("Hit Name"),
            "quality": libres_columns.index("Quality"),
            "mol_weight": libres_columns.index("Mol Weight (amu)"),
            "cas_number": libres_columns.index("CAS Number"),
            "library": libres_columns.index("Library"),
            "entry_number_library": libres_columns.index(
                "Entry Number Library"
            ),
        }

        # Hits for each compound.  A non-empty compound-number cell starts a
        # new compound; rows with an empty cell are further hits for the most
        # recently started compound.
        self._hits = {}  # type: ignore [misc]
        current_hits = None
        for row_idx in range(9, libres.nrows):
            row = libres.row_values(row_idx)
            cpd = row[compound_col]
            if cpd:
                current_hits = []
                self._hits[int(cpd)] = current_hits

            if current_hits is None:
                # Previously this surfaced as an UnboundLocalError.
                raise AssertionError(
                    f"Hit found before any compound number in the LibRes tab of {self._filename}"
                )

            current_hits.append(
                data.Hit(
                    **{field: row[col] for field, col in hit_columns.items()}
                )
            )

        # Check that all compounds from IntRes are in LibRes.  Plain lists are
        # compared on purpose: comparing against a NumPy array with `==`
        # raises ValueError on a length mismatch in recent NumPy versions,
        # which would mask the message below.
        expected_numbers = list(range(1, len(self._compounds) + 1))
        if sorted(self._hits) != expected_numbers:
            raise AssertionError(
                f"Hits are either not ordered correctly or are missing for certain compounds in {self._filename}"
            )