"""
Structures to organize raw data into.
Author: Nathan A. Mahynski
"""
import copy
import hashlib
import numpy as np
from typing import Any, ClassVar
class Entry:
    """
    Flat record pairing one peak (Compound) with one library assignment (Hit).

    This is essentially a combination of Hit and Compound intended to "unroll"
    their information into a flat data structure more amenable for searching.
    """

    # NOTE: these are declared with ClassVar, but every value is assigned on
    # the instance by `set_params` (which `__init__` calls).
    sample_filename: ClassVar[str]
    compound_number: ClassVar[int]
    rt: ClassVar[float]
    scan_number: ClassVar[int]
    area: ClassVar[int]
    baseline_height: ClassVar[int]
    absolute_height: ClassVar[int]
    peak_width: ClassVar[float]
    hit_number: ClassVar[int]
    hit_name: ClassVar[str]
    quality: ClassVar[int]
    mol_weight: ClassVar[float]
    cas_number: ClassVar[str]
    library: ClassVar[str]
    entry_number_library: ClassVar[int]

    def __init__(
        self,
        sample_filename: str,
        compound_number: int,
        rt: float,
        scan_number: int,
        area: int,
        baseline_height: int,
        absolute_height: int,
        peak_width: float,
        hit_number: int,
        hit_name: str,
        quality: int,
        mol_weight: float,
        cas_number: str,
        library: str,
        entry_number_library: int,
    ) -> None:
        """
        Initialize the Entry.

        The values are stored exactly as given; no type conversion is applied.

        Parameters
        ----------
        sample_filename : str
            Mass spectrometer output file data was read from.
        compound_number : int
            Compound / peak integer index.
        rt : float
            Retention time of the peak.
        scan_number : int
            Scan number.
        area : int
            Peak area.
        baseline_height : int
            Baseline peak height.
        absolute_height : int
            Absolute peak height.
        peak_width : float
            Peak width.
        hit_number : int
            Number assigned to the hit.
        hit_name : str
            Hit name assigned by the library used.
        quality : int
            The quality of the identification, as reported by the library.
        mol_weight : float
            Molecular weight of the assignment.
        cas_number : str
            CAS Number of the assigned hit.
        library : str
            Library used for identification.
        entry_number_library : int
            Number the assigned compound is in the library used.
        """
        self.set_params(
            sample_filename=sample_filename,
            compound_number=compound_number,
            rt=rt,
            scan_number=scan_number,
            area=area,
            baseline_height=baseline_height,
            absolute_height=absolute_height,
            peak_width=peak_width,
            hit_number=hit_number,
            hit_name=hit_name,
            quality=quality,
            mol_weight=mol_weight,
            cas_number=cas_number,
            library=library,
            entry_number_library=entry_number_library,
        )

    def set_params(self, **parameters: Any) -> "Entry":
        """
        Set parameters.

        Each keyword is assigned as an attribute of this instance; names are
        not validated against the known fields.

        Returns
        -------
        self : Entry
            This instance, so that calls can be chained.
        """
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def get_params(self) -> dict[str, Any]:
        """
        Get parameters.

        Returns
        -------
        params : dict(str, Any)
            The fifteen Entry fields, keyed by name, in declaration order.
        """
        return {
            "sample_filename": self.sample_filename,
            "compound_number": self.compound_number,
            "rt": self.rt,
            "scan_number": self.scan_number,
            "area": self.area,
            "baseline_height": self.baseline_height,
            "absolute_height": self.absolute_height,
            "peak_width": self.peak_width,
            "hit_number": self.hit_number,
            "hit_name": self.hit_name,
            "quality": self.quality,
            "mol_weight": self.mol_weight,
            "cas_number": self.cas_number,
            "library": self.library,
            "entry_number_library": self.entry_number_library,
        }

    def __repr__(self):
        """Self-representation."""
        return "<Entry at 0x{:x}>".format(id(self))
class Hit:
    """
    A possible assignment to a peak from the library in use.

    Each peak (Compound) in the MSRep.xls file > LibRes tab is assigned various Hits.
    """

    # NOTE: declared with ClassVar, but the values are assigned on the
    # instance by `set_params` (which `__init__` calls).
    number: ClassVar[int]
    name: ClassVar[str]
    quality: ClassVar[int]
    mol_weight: ClassVar[float]
    cas_number: ClassVar[str]
    library: ClassVar[str]
    entry_number_library: ClassVar[int]

    def __init__(
        self,
        number: int,
        name: str,
        quality: int,
        mol_weight: float,
        cas_number: str,
        library: str,
        entry_number_library: int,
    ) -> None:
        """
        Initialize the Hit.

        Every argument is coerced to its documented type before being stored.

        Parameters
        ----------
        number : int
            Number assigned to the hit.
        name : str
            Hit name assigned by the library used.
        quality : int
            The quality of the identification, as reported by the library.
        mol_weight : float
            Molecular weight of the assignment.
        cas_number : str
            CAS Number of the assigned hit.
        library : str
            Library used for identification.
        entry_number_library : int
            Number the assigned compound is in the library used.
        """
        self.set_params(
            number=int(number),
            name=str(name),
            quality=int(quality),
            mol_weight=float(mol_weight),
            cas_number=str(cas_number),
            library=str(library),
            entry_number_library=int(entry_number_library),
        )

    def set_params(self, **parameters: Any) -> "Hit":
        """
        Set parameters.

        Each keyword becomes an attribute of this instance; values passed
        here are stored as given (only `__init__` coerces types).

        Returns
        -------
        self : Hit
            This instance.
        """
        for attr_name in parameters:
            setattr(self, attr_name, parameters[attr_name])
        return self

    def get_params(self) -> dict[str, Any]:
        """
        Get parameters.

        Returns
        -------
        params : dict(str, Any)
            The seven Hit fields keyed by name.
        """
        field_names = (
            "number",
            "name",
            "quality",
            "mol_weight",
            "cas_number",
            "library",
            "entry_number_library",
        )
        return {field: getattr(self, field) for field in field_names}
class Compound:
    """
    A compound is a peak in the GCMS output that has been detected and must be assigned to one or more library Hits.

    Each peak (Compound) in the MSRep.xls file > LibRes tab is assigned various Hits.
    """

    # NOTE: declared with ClassVar, but the values are assigned on the
    # instance by `set_params` (which `__init__` calls).
    number: ClassVar[int]
    rt: ClassVar[float]
    scan_number: ClassVar[int]
    area: ClassVar[int]
    baseline_height: ClassVar[int]
    absolute_height: ClassVar[int]
    peak_width: ClassVar[float]

    def __init__(
        self,
        number: int,
        rt: float,
        scan_number: int,
        area: int,
        baseline_height: int,
        absolute_height: int,
        peak_width: float,
    ) -> None:
        """
        Initialize the Compound.

        Every argument is coerced to its documented type before being stored.

        Parameters
        ----------
        number : int
            Compound / peak integer index.
        rt : float
            Retention time of the peak.
        scan_number : int
            Scan number.
        area : int
            Peak area.
        baseline_height : int
            Baseline peak height.
        absolute_height : int
            Absolute peak height.
        peak_width : float
            Peak width.
        """
        self.set_params(
            number=int(number),
            rt=float(rt),
            scan_number=int(scan_number),
            area=int(area),
            baseline_height=int(baseline_height),
            absolute_height=int(absolute_height),
            peak_width=float(peak_width),
        )

    def set_params(self, **parameters: Any) -> "Compound":
        """
        Set parameters.

        Each keyword becomes an attribute of this instance; values passed
        here are stored as given (only `__init__` coerces types).

        Returns
        -------
        self : Compound
            This instance.
        """
        for attr_name in parameters:
            setattr(self, attr_name, parameters[attr_name])
        return self

    def get_params(self) -> dict[str, Any]:
        """
        Get parameters.

        Returns
        -------
        params : dict(str, Any)
            The seven Compound fields keyed by name.
        """
        field_names = (
            "number",
            "rt",
            "scan_number",
            "area",
            "baseline_height",
            "absolute_height",
            "peak_width",
        )
        return {field: getattr(self, field) for field in field_names}
class _SampleBase:
    """
    Base class to store the output from a mass spectrometer.

    Subclasses must override `read`, which is called by `__init__` and is
    expected to populate `_filename`, `_compounds` and `_hits`.
    """

    # NOTE: declared with ClassVar, but `read` is expected to set these on
    # the instance.
    _filename: ClassVar[str]
    _compounds: ClassVar[list]
    # Indexed by compound number; each value is the collection of Hit
    # objects for that compound (see `sorted_hits`).
    _hits: ClassVar[dict]

    def __init__(self, filename: str) -> None:
        """
        Instantiate the Sample.

        Parameters
        ----------
        filename : str
            Path to mass spectrometer output file to read.

        Raises
        ------
        IOError
            If `read` raises any exception; the original exception is
            attached as the cause.
        """
        try:
            self.read(filename)
        except Exception as e:
            # Chain explicitly so the underlying failure stays visible in
            # the traceback as the direct cause.
            raise IOError(f"Unable to read from {filename} : {e}") from e

    @property
    def filename(self):
        """Return a (shallow) copy of the stored filename."""
        return copy.copy(self._filename)

    @property
    def compounds(self):
        """Return a shallow copy of the stored compounds container."""
        return copy.copy(self._compounds)

    @property
    def hits(self):
        """Return a shallow copy of the stored hits dictionary."""
        return copy.copy(self._hits)

    @property
    def entries(self) -> list[Entry]:
        """
        Extract all Entry from this Sample.

        Returns
        -------
        all_entries : list(Entry)
            One Entry per (compound, hit) pair; compounds are visited in
            stored order and the hits of each compound in `sorted_hits` order.
        """
        # Only use 1 level in directory: keep the last two "/"-separated
        # components of the path. Invariant, so computed once.
        sample_filename = "/".join(self._filename.split("/")[-2:])

        all_entries = []
        for compound in self._compounds:
            for hit in self.sorted_hits(compound.number):
                all_entries.append(
                    Entry(
                        sample_filename=sample_filename,
                        compound_number=compound.number,
                        rt=compound.rt,
                        scan_number=compound.scan_number,
                        area=compound.area,
                        baseline_height=compound.baseline_height,
                        absolute_height=compound.absolute_height,
                        peak_width=compound.peak_width,
                        hit_number=hit.number,
                        hit_name=hit.name,
                        quality=hit.quality,
                        mol_weight=hit.mol_weight,
                        cas_number=hit.cas_number,
                        library=hit.library,
                        entry_number_library=hit.entry_number_library,
                    )
                )
        return all_entries

    def sorted_hits(self, compound_number: int) -> list[Hit]:
        """
        Hits should be sorted by quality, but this makes sure.

        A secondary sort is done by hit number to be consistent with mass spectrometer's ordering.

        Parameters
        ----------
        compound_number : int
            Compound number (starting from 1) in the mass spectrometer's output file.

        Returns
        -------
        hits : list(Hit)
            Hits sorted first by quality (highest first) and then by the
            number the mass spectrometer assigned (lowest first).

        Raises
        ------
        KeyError
            If `compound_number` is not a key of the stored hits.

        Example
        -------
        >>> s = Sample(...)
        >>> sorted_hits = s.sorted_hits(compound_number=42)
        """
        # Negating quality gives descending quality with ascending hit
        # number as the tie-breaker, without building a params dict per key.
        return sorted(
            self._hits[compound_number],
            key=lambda hit: (-hit.quality, hit.number),
        )

    def read(self, *args, **kwargs) -> None:
        """
        Read in the data from mass spectrometer output files.

        Should set the attributes:
        * _filename
        * _compounds
        * _hits

        Raises
        ------
        NotImplementedError
            Always, in this base class; subclasses must override.
        """
        raise NotImplementedError
class Utilities:
    """Utility functions for manipulating data structures."""

    @staticmethod
    def create_entries(samples: list) -> dict[str, Entry]:
        """
        Extract all Entry from samples.

        Each Entry is keyed by the sha1 hex digest of a string built from its
        parameters: the (name, value) pairs sorted by name, joined as
        "name_value" fragments separated by "_".

        Parameters
        ----------
        samples : list(_SampleBase)
            List of Samples; each must expose an `entries` iterable.

        Returns
        -------
        total_entries : dict(str, Entry)
            Dictionary of all Entry in `samples` whose keys are sha1 hashes and values are Entry objects.

        Raises
        ------
        AssertionError
            If two entries produce the same hash (e.g., two entries with
            identical parameters).
        """
        total_entries: dict[str, Entry] = {}
        for sample in samples:
            for entry in sample.entries:
                descr_ = "_".join(
                    name + "_" + str(value)
                    for name, value in sorted(entry.get_params().items())
                )
                key = hashlib.sha1(descr_.encode("utf-8")).hexdigest()
                if key in total_entries:
                    # Raised explicitly (same type and message as the former
                    # `assert`) so the check survives `python -O`.
                    raise AssertionError("Error : hash conflicts found")
                total_entries[key] = entry
        return total_entries

    @staticmethod
    def select_top_entries(total_entries: dict[str, Entry]) -> dict[str, Entry]:
        """
        Trim down the entries to just have the top (quality) hits (i.e., `hit_number` == 1).

        Parameters
        ----------
        total_entries : dict(str, Entry)
            Dictionary of all Entry in `samples` whose keys are sha1 hashes.

        Returns
        -------
        top_entries : dict(str, Entry)
            Dictionary of all Entry with `hit_number` == 1 whose keys are sha1 hashes and values are Entry objects.
        """
        return {
            key: entry
            for key, entry in total_entries.items()
            if entry.hit_number == 1
        }

    @staticmethod
    def group_entries_by_name(
        entries: dict[str, Entry]
    ) -> dict[str, list[tuple[Entry, str]]]:
        """
        Group entries with the same hit name.

        Parameters
        ----------
        entries : dict(str, Entry)
            Dictionary of Entry whose keys are sha1 hashes and values are Entry objects.

        Returns
        -------
        groups : dict(str, list(tuple(Entry, str)))
            Dictionary whose keys are hit names and whose values are lists of
            (Entry object, hash) tuples, in the iteration order of `entries`.
        """
        groups: dict[str, list[tuple[Entry, str]]] = {}
        for key, entry in entries.items():
            groups.setdefault(entry.hit_name, []).append((entry, key))
        return groups

    @staticmethod
    def group_entries_by_rt(
        entries: dict[str, Entry]
    ) -> dict[float, list[Entry]]:
        """
        Group entries with the same retention time.

        Retention times are used directly as dictionary keys, so only
        exactly equal values end up in the same group.

        Parameters
        ----------
        entries : dict(str, Entry)
            Dictionary of Entry whose keys are sha1 hashes and values are Entry objects.

        Returns
        -------
        groups : dict(float, list(Entry))
            Dictionary whose keys are retention times and whose values are
            lists of Entry objects, in the iteration order of `entries`.
        """
        groups: dict[float, list[Entry]] = {}
        for entry in entries.values():
            groups.setdefault(entry.rt, []).append(entry)
        return groups