Source code for starlingrt.sample

"""
Structures to store samples from different mass spectrometers.

Author: Nathan A. Mahynski
"""

import xlrd

import numpy as np

from starlingrt import data

from typing import Any


class MassHunterSample(data._SampleBase):
    """Class to store the MSRep.xls output from MassHunter(TM)."""

    def read(self, filename: str) -> None:
        """
        Read data from MSRep.xls file.

        This assumes a specific formatted output from MassHunter(TM) which is
        checked below.  The workbook must contain an "IntRes" and a "LibRes"
        tab:

        * The first 4 cells of the first column of "IntRes" are stored as
          metadata, and the same cells of "LibRes" must match them.
        * "IntRes" has its column header on row index 5; every following row
          is stored as one `data.Compound`.
        * "LibRes" has its column header on row index 8; every following row
          is stored as one `data.Hit`.  A row with a non-empty compound number
          starts the hit list of that compound; rows with an empty compound
          number are appended to the most recently started compound.

        Parameters
        ----------
        filename : str
            Pathname of MSRep.xls file.

        Raises
        ------
        AssertionError
            If the column headers or metadata are not as expected, if a hit
            row appears before any compound number, or if the compound numbers
            found in "LibRes" are not exactly 1..N, where N is the number of
            compounds read from "IntRes".  Note that `assert` statements are
            skipped when Python runs with `-O`.
        """
        self._filename = filename  # type: ignore [misc]

        wb = xlrd.open_workbook(self._filename)
        intres = wb.sheet_by_name("IntRes")
        libres = wb.sheet_by_name("LibRes")

        # Record metadata at the top of the IntRes tab
        self._metadata = [intres.cell(i, 0).value for i in range(4)]

        # Check that the columns are as expected.  The "Heigth" spelling is
        # intentional: it has to match the header text in the file.
        column_names = [
            "Compound number (#)",
            "RT (min)",
            "Scan number (#)",
            "Area (Ab*s)",
            "Baseline Heigth (Ab)",
            "Absolute Heigth (Ab)",
            "Peak Width 50% (min)",
            "Start Time (min)",
            "End Time (min)",
            "Start Height (Ab)",
            "End Height (Ab)",
            "Peak Type",
        ]
        assert (
            intres.row_values(5) == column_names
        ), f"Column names in the IntRes tab of {self._filename} are not as expected."

        # Resolve each column position once rather than once per cell.
        col = {name: idx for idx, name in enumerate(column_names)}

        # Read all compounds
        self._compounds = [  # type: ignore [misc]
            data.Compound(
                number=intres.cell(
                    row_idx, col["Compound number (#)"]
                ).value,
                rt=intres.cell(row_idx, col["RT (min)"]).value,
                scan_number=intres.cell(
                    row_idx, col["Scan number (#)"]
                ).value,
                area=intres.cell(row_idx, col["Area (Ab*s)"]).value,
                baseline_height=intres.cell(
                    row_idx, col["Baseline Heigth (Ab)"]
                ).value,
                absolute_height=intres.cell(
                    row_idx, col["Absolute Heigth (Ab)"]
                ).value,
                peak_width=intres.cell(
                    row_idx, col["Peak Width 50% (min)"]
                ).value,
            )
            for row_idx in range(6, intres.nrows)
        ]

        # Read all the hits for each compound
        # Check metadata at the top of the LibRes tab is the same
        check_meta = [libres.cell(i, 0).value for i in range(4)]
        assert (
            check_meta == self._metadata
        ), f"Metadata from the LibRes tab disagrees with IntRes in {self._filename}"

        # Check that the columns are as expected
        column_names = [
            "Compound number (#)",
            "RT (min)",
            "Scan number (#)",
            "Area (Ab*s)",
            "Baseline Heigth (Ab)",
            "Absolute Heigth (Ab)",
            "Peak Width 50% (min)",
            "Hit Number",
            "Hit Name",
            "Quality",
            "Mol Weight (amu)",
            "CAS Number",
            "Library",
            "Entry Number Library",
        ]
        assert (
            libres.row_values(8) == column_names
        ), f"Column names in the LibRes tab of {self._filename} are not as expected."

        col = {name: idx for idx, name in enumerate(column_names)}

        # Hits for each compounds
        self._hits = {}  # type: ignore [misc]
        cpd_no = None  # compound whose hit list is currently being filled
        for row_idx in range(9, libres.nrows):
            cpd = libres.cell(row_idx, col["Compound number (#)"]).value
            if cpd:
                # A non-empty compound number starts a new list of hits.
                cpd_no = int(cpd)
                self._hits[cpd_no] = []

            # A hit row must belong to a compound; without this check a
            # leading row with an empty compound number would crash with an
            # UnboundLocalError instead of a useful message.
            assert cpd_no is not None, (
                f"Row {row_idx + 1} in the LibRes tab of {self._filename} "
                "lists a hit before any compound number."
            )

            self._hits[cpd_no].append(
                data.Hit(
                    number=libres.cell(row_idx, col["Hit Number"]).value,
                    name=libres.cell(row_idx, col["Hit Name"]).value,
                    quality=libres.cell(row_idx, col["Quality"]).value,
                    mol_weight=libres.cell(
                        row_idx, col["Mol Weight (amu)"]
                    ).value,
                    cas_number=libres.cell(row_idx, col["CAS Number"]).value,
                    library=libres.cell(row_idx, col["Library"]).value,
                    entry_number_library=libres.cell(
                        row_idx, col["Entry Number Library"]
                    ).value,
                )
            )

        # Check that all compounds from IntRes are in LibRes.  Plain lists are
        # compared so that a differing number of compounds fails this
        # assertion; comparing a list against a NumPy array of another length
        # raises a ValueError in recent NumPy versions instead.
        expected_numbers = list(range(1, len(self._compounds) + 1))
        assert (
            sorted(self._hits) == expected_numbers
        ), f"Hits are either not ordered correctly or are missing for certain compounds in {self._filename}"