Source code for starlingrt.data

"""
Structures to organize raw data into.

Author: Nathan A. Mahynski
"""

import copy
import hashlib
import numpy as np

from typing import Any, ClassVar


[docs]class Entry: """ Create an Entry. This is essentially a combination of Hit and Compound intended to "unroll" their information into a flat data structure more amenable for searching. """ sample_filename: ClassVar[str] compound_number: ClassVar[int] rt: ClassVar[int] scan_number: ClassVar[int] area: ClassVar[int] baseline_height: ClassVar[int] absolute_height: ClassVar[int] peak_width: ClassVar[float] hit_number: ClassVar[int] hit_name: ClassVar[str] quality: ClassVar[int] mol_weight: ClassVar[float] cas_number: ClassVar[str] library: ClassVar[str] entry_number_library: ClassVar[int] def __init__( self, sample_filename: str, compound_number: int, rt: int, scan_number: int, area: int, baseline_height: int, absolute_height: int, peak_width: float, hit_number: int, hit_name: str, quality: int, mol_weight: float, cas_number: str, library: str, entry_number_library: int, ): """ Initialize the Entry. Parameters ---------- sample_filename : str Mass spectrometer output file data was read from. compound_number : int Compound / peak integer index. rt : float Retention time of the peak. scan_number : int Scan number. area : int Peak area. baseline_height : int Baseline peak height. absolute_height: int Absolute peak height. peak_width : float Peak width. hit_number : int Number assigned to the hit. hit_name : str Hit name assigned by the library used. quality : int The quality of the identification, as reported by the library. mol_weight : float Molecular weight of the assignment. cas_number : str CAS Number of the assigned hit. library : str Library used for identification. entry_number_library : int Number the assigned compount is in the library used. """ self.set_params( **{ "sample_filename": sample_filename, "compound_number": compound_number, "rt": rt, "scan_number": scan_number, "area": area, "baseline_height": baseline_height, "absolute_height": absolute_height, "peak_width": peak_width, "hit_number": hit_number, "hit_name": hit_name, "quality": quality, "mol_weight": mol_weight, "cas_number": cas_number, "library": library, "entry_number_library": entry_number_library, } )
[docs] def set_params(self, **parameters: Any) -> "Entry": """Set parameters.""" for parameter, value in parameters.items(): setattr(self, parameter, value) return self
[docs] def get_params(self) -> dict[str, Any]: """Get parameters.""" return { "sample_filename": self.sample_filename, "compound_number": self.compound_number, "rt": self.rt, "scan_number": self.scan_number, "area": self.area, "baseline_height": self.baseline_height, "absolute_height": self.absolute_height, "peak_width": self.peak_width, "hit_number": self.hit_number, "hit_name": self.hit_name, "quality": self.quality, "mol_weight": self.mol_weight, "cas_number": self.cas_number, "library": self.library, "entry_number_library": self.entry_number_library, }
def __repr__(self): """Self-representation.""" return "<Entry at 0x{:x}>".format(id(self))
[docs]class Hit: """ A possible assignment to a peak from the library in use. Each peak (Compound) in the MSRep.xls file > LibRes tab is assigned various Hits. """ number: ClassVar[int] name: ClassVar[str] quality: ClassVar[int] mol_weight: ClassVar[float] cas_number: ClassVar[str] library: ClassVar[str] entry_number_library: ClassVar[int] def __init__( self, number: int, name: str, quality: int, mol_weight: float, cas_number: str, library: str, entry_number_library: int, ) -> None: """ Initialize the Hit. Parameters ---------- number : int Number assigned to the hit. name : str Hit name assigned by the library used. quality : int The quality of the identification, as reported by the library. mol_weight : float Molecular weight of the assignment. cas_number : str CAS Number of the assigned hit. library : str Library used for identification. entry_number : int Number the assigned compount is in the library used. """ self.set_params( **{ "number": int(number), "name": str(name), "quality": int(quality), "mol_weight": float(mol_weight), "cas_number": str(cas_number), "library": str(library), "entry_number_library": int(entry_number_library), } )
[docs] def set_params(self, **parameters: Any) -> "Hit": """Set parameters.""" for parameter, value in parameters.items(): setattr(self, parameter, value) return self
[docs] def get_params(self) -> dict[str, Any]: """Get parameters.""" return { "number": self.number, "name": self.name, "quality": self.quality, "mol_weight": self.mol_weight, "cas_number": self.cas_number, "library": self.library, "entry_number_library": self.entry_number_library, }
[docs]class Compound: """ A compound is a peak in the GCMS output that has been detected and must be assigned to one or more library Hits. Each peak (Compound) in the MSRep.xls file > LibRes tab is assigned various Hits. """ number: ClassVar[int] rt: ClassVar[float] scan_number: ClassVar[int] area: ClassVar[int] baseline_height: ClassVar[int] absolute_height: ClassVar[int] peak_width: ClassVar[float] def __init__( self, number: int, rt: float, scan_number: int, area: int, baseline_height: int, absolute_height: int, peak_width: float, ) -> None: """ Initialize the Compound. Parameters ---------- number : int Compound / peak integer index. rt : float Retention time of the peak. scan_number : int Scan number. area : int Peak area. baseline_height : int Baseline peak height. absolute_height: int Absolute peak height. peak_width : float Peak width. """ self.set_params( **{ "number": int(number), "rt": float(rt), "scan_number": int(scan_number), "area": int(area), "baseline_height": int(baseline_height), "absolute_height": int(absolute_height), "peak_width": float(peak_width), } )
[docs] def set_params(self, **parameters: Any) -> "Compound": """Set parameters.""" for parameter, value in parameters.items(): setattr(self, parameter, value) return self
[docs] def get_params(self) -> dict[str, Any]: """Get parameters.""" return { "number": self.number, "rt": self.rt, "scan_number": self.scan_number, "area": self.area, "baseline_height": self.baseline_height, "absolute_height": self.absolute_height, "peak_width": self.peak_width, }
class _SampleBase: """Base class to store the output from a mass spectrometer.""" _filename: ClassVar[str] _compounds: ClassVar[list] _hits: ClassVar[dict] def __init__(self, filename: str) -> None: """ Instantiate the Sample. Parameters ---------- filename : str Path to mass spectrometer output file to read. """ try: self.read(filename) except Exception as e: raise IOError(f"Unable to read from {filename} : {e}") @property def filename(self): return copy.copy(self._filename) @property def compounds(self): return copy.copy(self._compounds) @property def hits(self): return copy.copy(self._hits) @property def entries(self) -> list[Entry]: """ Extract all Entry from Samples. Returns ---------- all_entries : list(Entry) List of all Entry created from known Samples and their Hits. """ all_entries = [] for compound in self._compounds: for hit in self.sorted_hits(compound.number): all_entries.append( Entry( sample_filename="/".join( self._filename.split("/")[-2:] ), # Only use 1 level in directory compound_number=compound.number, rt=compound.rt, scan_number=compound.scan_number, area=compound.area, baseline_height=compound.baseline_height, absolute_height=compound.absolute_height, peak_width=compound.peak_width, hit_number=hit.number, hit_name=hit.name, quality=hit.quality, mol_weight=hit.mol_weight, cas_number=hit.cas_number, library=hit.library, entry_number_library=hit.entry_number_library, ) ) return all_entries def sorted_hits(self, compound_number: int) -> list[Hit]: """ Hits should be sorted by quality, but this makes sure. A secondary sort is done by hit number to be consistent with mass spectrometer's ordering. Parameters ---------- compound_number : int Compound number (starting from 1) in the mass spectrometer's output file. Returns ------- hits : list(Hit) Hits sorted first by quality and then by the number the mass spectrometer assigned when it performed this sort. Example ------- >>> s = Sample(...) >>> sorted_hits = s.sorted_hits(compound_number=42) """ return sorted(self._hits[compound_number], key=lambda x: (x.get_params().get("quality"), -x.get_params().get("number")), reverse=True) # type: ignore [operator] def read(self, *args, **kwargs) -> None: """ Read in the data from mass spectrometer output files. Should set the class variables: * _filename * _compounds * _hits """ raise NotImplementedError
[docs]class Utilities: """Utility functions for manipulating data structures."""
[docs] @staticmethod def create_entries(samples: list) -> dict[str, Entry]: """ Extract all Entry from samples. Parameters ---------- samples : list(_SampleBase) List of Samples collected from all directories in `input_directory`. Returns ------- total_entries : dict(str, Entry) Dictionary of all Entry in `samples` whose keys are sha1 hashes and values are Entry objects. """ total_entries = {} checksum = 0 for sample in samples: for entry in sample.entries: checksum += 1 descr_ = "_".join( [ "_".join([a, str(b)]) for a, b in sorted(list(entry.get_params().items())) ] ) hash_ = hashlib.sha1(descr_.encode("utf-8")) total_entries[hash_.hexdigest()] = entry assert len(total_entries) == checksum, "Error : hash conflicts found" return total_entries
[docs] @staticmethod def select_top_entries(total_entries: dict[str, Entry]) -> dict[str, Entry]: """ Trim down the entries to just have the top (quality) hits (i.e., `hit_number` == 1). Parameters ---------- total_entries : dict(str, Entry) Dictionary of all Entry in `samples` whose keys are sha1 hashes. Returns ------- top_entries : dict(str, Entry) Dictionary of all Entry with `hit_number` == 1 whose keys are sha1 hashes and values are Entry objects. """ top_entries = {} for k, v in total_entries.items(): if v.hit_number == 1: top_entries[k] = v return top_entries
[docs] @staticmethod def group_entries_by_name( entries: dict[str, Entry] ) -> dict[str, list[tuple[Entry, str]]]: """ Group entries with the same hit name. Parameters ---------- entries : dict(str, Entry) Dictionary of Entry whose keys are sha1 hashes and values are Entry objects. Returns ------- groups : dict(str, list(tuple(Entry, str))) Dictionary of Entry whose keys are hit names and values are tuples of (Entry objects, hash). """ groups: dict[str, list[tuple[Entry, str]]] = {} for hash, entry in entries.items(): if entry.hit_name in groups: groups[entry.hit_name].append((entry, hash)) else: groups[entry.hit_name] = [(entry, hash)] return groups
[docs] @staticmethod def group_entries_by_rt( entries: dict[str, Entry] ) -> dict[float, list[Entry]]: """ Group entries with the same retention time. Parameters ---------- entries : dict(str, Entry) Dictionary of Entry whose keys are sha1 hashes and values are Entry objects. Returns ------- groups : dict(float, list(Entry)) Dictionary of Entry whose keys are retention times and values are Entry objects. """ groups: dict[float, list[Entry]] = {} for entry in entries.values(): if entry.rt in groups: groups[entry.rt].append(entry) else: groups[entry.rt] = [entry] return groups