Source code for starlingrt.visualize

"""
Visualize GCMS data to determine consensus and any corrections necessary.

Author: Nathan A. Mahynski
"""
import numpy as np
from starlingrt import functions, data

from bokeh.plotting import figure, output_file, save
from bokeh.layouts import layout
from bokeh.models import ColumnDataSource, DataTable, HoverTool, HoverTool, CrosshairTool, FileInput, SelectEditor, CustomJS, Segment, VBar, Rect, Button, TextInput, RadioButtonGroup, Span, StringFormatter, TableColumn, RangeSlider, Slider, Select, CDSView, IndexFilter, HTMLTemplateFormatter  # type: ignore [attr-defined]


[docs]def make( top_entries: dict[str, data.Entry], width: int, threshold: float, output_filename: str = "summary.html", ) -> None: """ Make the interactive HTML document for users to inspect. Parameters ---------- top_entries : dict(str, Entry) Top entries (hit number = 1) labeled by their sha1 hash. width : int Width of the HTML table output. threshold : float Minimum retention time gap between consecutive compounds to be resolved as different. output_filename : str, optional(default="summary.html") Name of HTML file to save results to. """ # Get Data and perform initial calculations in python df, name_groups, ordered_cats = functions.get_dataframe(top_entries) df_iqr = functions.get_quantiles_df(name_groups) by_name = data.Utilities.group_entries_by_name(top_entries) # Create column for changeable name rt_groups = functions.group_by_rt_step(df, threshold) suggested_name, ties, entropy = functions.suggest_names(rt_groups) df = functions.assign_suggestions(df, rt_groups, suggested_name, ties) df["new_name"] = df["suggested_name"].copy() # Create sources for Bokeh JavaScript total_source = ColumnDataSource(df) iqr_source = ColumnDataSource(df_iqr) view = CDSView(filter=IndexFilter(np.arange(len(df)).tolist())) # Make Table compound_names = sorted(df["hit_name"].unique()) + [ "unknown" ] # Lowercase 'unknown' for easier autocomplete filenames = sorted(df["origin"].unique()) # https://stackoverflow.com/questions/50996875/how-to-color-rows-and-or-cells-in-a-bokeh-datatable template = """ <div style="background:<%= (function colorfromint(){ if (flag === "" && hit_name === suggested_name) {return("green")} else if (flag === "") {return("yellow")} else {return("red")} }()) %>; color: black"> <%= value %> </div> """ formatter = HTMLTemplateFormatter(template=template) columns = [ TableColumn( field="rt", title="Retention Time", ), TableColumn( field="quality", title="Quality", ), TableColumn( field="hit_name", title="Original Name", # Original name is not changeable formatter=StringFormatter(font_style="bold"), ), TableColumn( field="suggested_name", title="Suggested Name", # Computer suggested name is not changeable formatter=StringFormatter(font_style="bold"), ), TableColumn( field="second_suggestion", title="Second Suggestion", # Computer suggested name is not changeable formatter=StringFormatter(font_style="bold"), ), TableColumn( field="new_name", title="Assigned Name", editor=SelectEditor( options=compound_names ), # Can be modified based on user choice formatter=formatter, ), TableColumn( field="hit_number", title="Hit Number", formatter=StringFormatter(font_style="bold"), ), TableColumn( field="peak_width", title="Peak Width", ), TableColumn( field="area", title="Area", ), TableColumn( field="origin", title="Filename", ), TableColumn( field="flag", title="Flag", ), TableColumn( field="hash", title="Entry Hash", ), ] data_table = DataTable( source=total_source, view=view, columns=columns, # Based on all possibilities regardless of quality editable=True, # False width=width, index_position=-1, index_header="Index", index_width=60, ) # Make Figure p = figure( background_fill_color="#efefef", x_range=ordered_cats, title="Retention Time Ranges", width=width, height=int(width * 0.6), tools="pan,wheel_zoom,ybox_select,xbox_select,lasso_select,reset", active_drag="pan", x_axis_label="", y_axis_label="Retention Time", ) # Whisker plot # Stems upper_seg = Segment( x0="new_name", y0="upper", x1="new_name", y1="q3", line_color="black", line_alpha=0.6, ) lower_seg = Segment( x0="new_name", y0="lower", x1="new_name", y1="q1", line_color="black", line_alpha=0.6, ) p.add_glyph(iqr_source, upper_seg) p.add_glyph(iqr_source, lower_seg) # Boxes upper_bar = VBar( x="new_name", width=0.7, bottom="q2", top="q3", fill_color="#E08E79", line_color="black", fill_alpha=0.6, line_alpha=0.6, ) lower_bar = VBar( x="new_name", width=0.7, bottom="q1", top="q2", fill_color="#3B8686", line_color="black", fill_alpha=0.6, line_alpha=0.6, ) p.add_glyph(iqr_source, upper_bar) p.add_glyph(iqr_source, lower_bar) # Whiskers (almost-0 height rects simpler than segments) upper_whisker = Rect( x="new_name", y="upper", width=0.2, height=0.0001, line_color="black", fill_color="black", fill_alpha=0.6, line_alpha=0.6, ) lower_whisker = Rect( x="new_name", y="lower", width=0.2, height=0.0001, line_color="black", fill_color="black", fill_alpha=0.6, line_alpha=0.6, ) p.add_glyph(iqr_source, upper_whisker) p.add_glyph(iqr_source, lower_whisker) # Build IQR inspection plots k_step = 0.5 k_values = np.arange(1.0, 3.51, k_step).tolist() change_hist_code = """ var name = compound_select.value; var k = k_select.value.toString(); var new_data = {}; new_data['top'] = compounds_hist[name]; new_data['bottom'] = Array(compounds_hist[name].length).fill(0); new_data['right'] = compounds_edges[name].slice(1); new_data['left'] = compounds_edges[name].slice(0, -1); source.data = new_data; source.change.emit(); var new_dict_c = {} new_dict_c['x'] = points[name][k]['concern']['x']; new_dict_c['y'] = points[name][k]['concern']['y']; concerns.data = new_dict_c; var new_dict_nc = {} new_dict_nc['x'] = points[name][k]['not_concern']['x']; new_dict_nc['y'] = points[name][k]['not_concern']['y']; not_concern.data = new_dict_nc; concerns.change.emit(); not_concern.change.emit(); var curr_data = {}; curr_data['x'] = []; curr_data['y'] = []; curr_data['Index'] = []; for (let i = 0; i < view.filter.indices.length; ++i) { let idx = view.filter.indices[i]; if (total_source.data.new_name[idx] === name) { curr_data['x'].push(total_source.data.rt[idx]); curr_data['y'].push(curr_data['x'].length); curr_data['Index'].push(idx); } } source_current.data = curr_data; source_current.change.emit(); """ compounds_hist, compounds_edges, points = functions.make_histograms( by_name, k_values, bins=10 ) def compare_factory(dropdown=True): compare = figure( y_axis_label="Counts", x_axis_label="Retention Time", ) name = list(by_name.keys())[0] source = ColumnDataSource( dict( left=compounds_edges[name][:-1], right=compounds_edges[name][1:], top=compounds_hist[name], bottom=[0] * len(compounds_hist[name]), ) ) source_concerns = ColumnDataSource( dict( x=points[name][str(np.min(k_values))]["concern"]["x"], y=points[name][str(np.min(k_values))]["concern"]["y"], ) ) source_not_concern = ColumnDataSource( dict( x=points[name][str(np.min(k_values))]["not_concern"]["x"], y=points[name][str(np.min(k_values))]["not_concern"]["y"], ) ) current = {"x": [], "y": [], "Index": []} for idx in view.filter.indices: if total_source.data["new_name"][idx] == name: current["x"].append(total_source.data["rt"][idx]) current["y"].append(len(current["x"])) current["Index"].append(idx) source_current = ColumnDataSource( dict(x=current["x"], y=current["y"], Index=current["Index"]) ) compare.quad( source=source, fill_color="navy", line_color="white", alpha=0.5 ) compare.circle( x="x", y="y", color="red", size=10, alpha=0.5, source=source_concerns, legend_label="(Original) Concerning", ) compare.circle( x="x", y="y", color="blue", size=10, alpha=0.5, source=source_not_concern, legend_label="(Original) Not Concerning", ) current_points = compare.star( x="x", y="y", color="black", size=10, alpha=0.5, source=source_current, legend_label="(Visible) Currently Assigned", ) compare.legend.location = "top_left" compare.legend.click_policy = "hide" if dropdown: # Downdown menu for compounds compound_select = Select( title="Select Compound", value=name, options=[ name for name, entries in sorted( by_name.items(), key=lambda x: np.median([e[0].rt for e in x[1]]), ) ], ) # Sort by RT else: # Manually enter compound_select = TextInput(title="Select Compound", value=name) k_select = Select( title="K Value", value=str(np.min(k_values)), options=[str(x) for x in k_values], ) callback = CustomJS( args=dict( source=source, concerns=source_concerns, not_concern=source_not_concern, source_current=source_current, view=view, total_source=total_source, compounds_hist=compounds_hist, compounds_edges=compounds_edges, points=points, compound_select=compound_select, k_select=k_select, ), code=change_hist_code, ) compound_select.js_on_change("value", callback) k_select.js_on_change("value", callback) htool = HoverTool( renderers=[current_points], tooltips=[("RT", "@x"), ("Index", "@Index")], ) compare.add_tools(htool) return ( compare, source, source_concerns, source_not_concern, source_current, compound_select, k_select, ) ( compare_1, source_1, source_concerns_1, source_not_concern_1, source_current_1, compound_select_1, k_select_1, ) = compare_factory(dropdown=True) ( compare_2, source_2, source_concerns_2, source_not_concern_2, source_current_2, compound_select_2, k_select_2, ) = compare_factory(dropdown=True) ( compare_3, source_3, source_concerns_3, source_not_concern_3, source_current_3, compound_select_3, k_select_3, ) = compare_factory(dropdown=True) # Link plots with crosshair i_width = Span(dimension="width", line_dash="dashed", line_width=2) i_height = Span(dimension="height", line_dash="dotted", line_width=2) compare_1.add_tools(CrosshairTool(overlay=[i_width, i_height])) compare_2.add_tools(CrosshairTool(overlay=[i_width, i_height])) compare_3.add_tools(CrosshairTool(overlay=[i_width, i_height])) ( alt_compare_1, alt_source_1, alt_source_concerns_1, alt_source_not_concern_1, alt_source_current_1, alt_compound_select_1, alt_k_select_1, ) = compare_factory(dropdown=False) ( alt_compare_2, alt_source_2, alt_source_concerns_2, alt_source_not_concern_2, alt_source_current_2, alt_compound_select_2, alt_k_select_2, ) = compare_factory(dropdown=False) ( alt_compare_3, alt_source_3, alt_source_concerns_3, alt_source_not_concern_3, alt_source_current_3, alt_compound_select_3, alt_k_select_3, ) = compare_factory(dropdown=False) # Link plots with crosshair alt_i_width = Span(dimension="width", line_dash="dashed", line_width=2) alt_i_height = Span(dimension="height", line_dash="dotted", line_width=2) alt_compare_1.add_tools(CrosshairTool(overlay=[alt_i_width, alt_i_height])) alt_compare_2.add_tools(CrosshairTool(overlay=[alt_i_width, alt_i_height])) alt_compare_3.add_tools(CrosshairTool(overlay=[alt_i_width, alt_i_height])) # Min observation slider (filter) min_obs_slider = Slider( start=0, end=name_groups["hit_name"] .count() .max(), # Based on original assignment value=0, step=1, title="Minimum Observations within Quality Range", ) # Minimum Quality slider (filter) quality_slider = RangeSlider( start=0, end=100, step=1, value=(1, 99), title="Quality Range" ) # This computes all the statistics, summaries, etc. to display in various glyphs. This is all in one # function so that different events, when triggered, will all call the same function and update # everything together to remain in sync. recompute_code = """ var qvals = quality_slider.value; view.filter.indices = []; // Get the data from the data sources var t = total_source.data var iqr_ = iqr_source.data; var iqr_data = {}; iqr_.new_name = [] iqr_.q1 = [] iqr_.q2 = [] iqr_.q3 = [] iqr_.upper = [] iqr_.lower = [] var counts = {}; for (var i = 0; i < t.index.length; i++) { counts[t.new_name[i]] = 0; } for (var i = 0; i < t.index.length; i++) { counts[t.new_name[i]] += 1; } // Initialize IQR data for (var i = 0; i < t.index.length; i++) { if ((qvals[0] <= t.quality[i]) && (t.quality[i] <= qvals[1]) && (counts[t.new_name[i]] > min_obs_slider.value)) { iqr_data[t.new_name[i]] = []; } } var data_1 = {}; data_1['x'] = []; data_1['y'] = []; data_1['Index'] = []; var data_2 = {}; data_2['x'] = []; data_2['y'] = []; data_2['Index'] = []; var data_3 = {}; data_3['x'] = []; data_3['y'] = []; data_3['Index'] = []; // Update the visible data for (var i = 0; i < t.index.length; i++) { if ((qvals[0] <= t.quality[i]) && (t.quality[i] <= qvals[1]) && (counts[t.new_name[i]] > min_obs_slider.value)) { view.filter.indices.push(i); // Accumulate rentention times iqr_data[t.new_name[i]].push(t.rt[i]); if (t.new_name[i] === compound_select_1.value) { data_1['x'].push(t.rt[i]); data_1['y'].push(data_1['x'].length); data_1['Index'].push(t.index[i]); } if (t.new_name[i] === compound_select_2.value) { data_2['x'].push(t.rt[i]); data_2['y'].push(data_2['x'].length); data_2['Index'].push(t.index[i]); } if (t.new_name[i] === compound_select_3.value) { data_3['x'].push(t.rt[i]); data_3['y'].push(data_3['x'].length); data_3['Index'].push(t.index[i]); } } } source_current_1.data = data_1; source_current_2.data = data_2; source_current_3.data = data_3; view.change.emit(); source_current_1.change.emit(); source_current_2.change.emit(); source_current_3.change.emit(); // Update IQR visuals // From: https://www.geeksforgeeks.org/interquartile-range-iqr/ function QUANTILE(data, q) { // R-7 method: https://en.wikipedia.org/wiki/Quantile#Estimating_quantiles_from_a_sample let values = data.slice().sort((a, b) => a - b); // copy and sort let h = (data.length - 1)*q; if (h % 1 === 0) { return values[h] } else { let base = Math.floor(h); return values[base] + (h - base)*(values[Math.ceil(h)] - values[base]); } } var iqr_values = [0, 0, 0]; for (const [new_name, values] of Object.entries(iqr_data)) { iqr_values = [QUANTILE(values, 0.25), QUANTILE(values, 0.5), QUANTILE(values, 0.75)]; iqr_.new_name.push(new_name); iqr_.q1.push(iqr_values[0]); iqr_.q2.push(iqr_values[1]); iqr_.q3.push(iqr_values[2]); iqr_.lower.push(iqr_values[0] - 1.5*(iqr_values[2] - iqr_values[0])); iqr_.upper.push(iqr_values[2] + 1.5*(iqr_values[2] - iqr_values[0])); } iqr_source.change.emit(); total_source.change.emit(); """ recompute = CustomJS( args=dict( total_source=total_source, view=view, iqr_source=iqr_source, min_obs_slider=min_obs_slider, quality_slider=quality_slider, source_current_1=source_current_1, compound_select_1=compound_select_1, source_current_2=source_current_2, compound_select_2=compound_select_2, source_current_3=source_current_3, compound_select_3=compound_select_3, ), code=recompute_code, ) # Recompute should be triggered by filters (quality and min_obs) and user edits to the datatable quality_slider.js_on_change("value", recompute) min_obs_slider.js_on_change("value", recompute) total_source.js_on_change("patching", recompute) # Datapoints points = p.circle( x="new_name", y="rt", color="#F38630", size=4, alpha=0.5, source=total_source, view=view, ) tooltips = [ ("Original Name", "@hit_name"), ("Quality", "@quality"), ("File", "@origin"), ] hover_tool = HoverTool(renderers=[points], tooltips=tooltips) p.add_tools(hover_tool) crosshair_tool = CrosshairTool() p.add_tools(crosshair_tool) iqr_columns = [ TableColumn( field="new_name", title="Name", formatter=StringFormatter(font_style="bold"), ), TableColumn( field="lower", title="Lower Whisker", ), TableColumn( field="q1", title="Q1", ), TableColumn( field="q2", title="Q2", ), TableColumn( field="q3", title="Q3", ), TableColumn( field="upper", title="Upper Whisker", ), ] iqr_table = DataTable( source=iqr_source, columns=iqr_columns, editable=False, width=width, index_position=-1, index_header="Index", index_width=60, ) filename_input_iqr = TextInput( value="exported_iqr.txt", title="" ) # Export quartile data to:") download_button_iqr = Button( label="Export Tab Delimited File", button_type="success" ) download_button_iqr.js_on_click( CustomJS( # https://github.com/surfaceowl-ai/python_visualizations/blob/main/notebooks/bokeh_save_export_data.py args=dict(iqr_source=iqr_source, filename_input=filename_input_iqr), code=""" function table_to_csv(source) { const columns = ['new_name', 'lower', 'q1', 'q2', 'q3', 'upper']; const nrows = source.get_length(); var lines = ["Name\tLower Whisker\tQ1\tQ2\tQ3\tUpper Whisker"]; for (let i = 0; i < source.get_length(); i++) { lines += '\\n'; for (let j = 0; j < columns.length; j++) { lines += source.data[columns[j]][i].toString(); if (j < columns.length-1) { lines += '\\t'; } } } return lines; } var file = new Blob([table_to_csv(iqr_source)], {type: 'text/plain'}); var elem = window.document.createElement('a'); elem.href = window.URL.createObjectURL(file); elem.download = filename_input.value; document.body.appendChild(elem); elem.click(); document.body.removeChild(elem); """, ) ) filename_input = TextInput( value="saved_state.bt", title="" ) # Export to (must have .bt suffix):") radio = RadioButtonGroup( labels=["Selected Points Only", "Entire Table"], active=0 ) download_button = Button( label="Save Current State to Tab Delimited File", button_type="success" ) # CSV gets confused with commas in hit_names download_button.js_on_click( CustomJS( # https://github.com/surfaceowl-ai/python_visualizations/blob/main/notebooks/bokeh_save_export_data.py args=dict( total_source=total_source, filename_input=filename_input, radio=radio, min_obs_slider=min_obs_slider, quality_slider=quality_slider, ), code=""" var inds; if (radio.active == 0) { // Only selected points inds = total_source.selected.indices; } else { // All points in the table inds = []; for (let i=0; i < total_source.get_length(); ++i) { inds.push(i); } } function table_to_csv(source) { const columns = ['rt', 'quality', 'hit_name', 'suggested_name', 'new_name', 'hit_number', 'mol_weight', 'absolute_height', 'baseline_height', 'peak_width', 'area', 'origin', 'compound_number', 'library', 'entry_number_library', 'scan_number', 'flag', 'hash']; const nrows = source.get_length(); var lines = 'Minimum Observations within Quality Range\t' + min_obs_slider.value.toString() + '\tQuality Range\t' + quality_slider.value[0].toString() + '\t' + quality_slider.value[1].toString(); lines += ['\\nRetention Time\tQuality\tOriginal Name\tSuggested Name\tAssigned Name\tHit Number\tMolecular Weight\tAbsolute Height\tBaseline Height\tPeak Width\tArea\tFilename\tCompound Number\tLibrary\tLibrary Number\tScan Number\tFlag\tHash']; for (let i = 0; i < inds.length; i++) { lines += '\\n'; for (let j = 0; j < columns.length; j++) { lines += source.data[columns[j]][inds[i]].toString() if (j < columns.length-1) { lines += '\\t'; } } } return lines; } var file = new Blob([table_to_csv(total_source)], {type: 'text/plain'}); var elem = window.document.createElement('a'); elem.href = window.URL.createObjectURL(file); elem.download = filename_input.value; document.body.appendChild(elem); elem.click(); document.body.removeChild(elem); """, ) ) filename_import = FileInput(accept=".bt,") import_button = Button(label="Load Previous State", button_type="danger") import_button.js_on_click( CustomJS( args=dict( source=total_source, filename_import=filename_import, quality_slider=quality_slider, min_obs_slider=min_obs_slider, ), code=""" var data = atob(filename_import.value); // Convert from base64 var lines = data.split('\\n'); // These edits will trigger new visible data min_obs_slider.value = parseInt(lines[0].split('\\t')[1]); quality_slider.value = [parseInt(lines[0].split('\\t')[3]), parseInt(lines[0].split('\\t')[4])]; for (let i = 1; i < lines.length; ++i) { let row = lines[i].split('\\t'); let hash = row[row.length - 1]; let index = -1; // Try to find entry in total source and update it (based on hash) for (let j = 0; j < source.get_length(); ++j) { if (hash === source.data.hash[j]) { index = j; break; } } if (index >= 0) { source.data.rt[index] = parseFloat(row[0]); source.data.quality[index] = parseInt(row[1]); source.data.hit_name[index] = row[2]; source.data.suggested_name[index] = row[3]; source.data.new_name[index] = row[4]; source.data.hit_number[index] = parseInt(row[5]); source.data.mol_weight[index] = parseFloat(row[6]); source.data.absolute_height[index] = parseInt(row[7]); source.data.baseline_height[index] = parseInt(row[8]); source.data.peak_width[index] = parseFloat(row[9]); source.data.area[index] = parseInt(row[10]); source.data.origin[index] = row[11]; source.data.compound_number[index] = parseInt(row[12]); source.data.library[index] = row[13]; source.data.entry_number_library[index] = parseInt(row[14]); source.data.scan_number[index] = parseInt(row[15]); source.data.flag[index] = row[16] } } source.change.emit(); """, ) ) import_button.js_on_click( recompute ) # Also trigger a re-compute after loading the data p.xaxis.major_label_orientation = "vertical" output_file( filename=output_filename, title="Interactive Retention Time Summary" ) save( layout( [ [ filename_import, import_button, filename_input, radio, download_button, ], [data_table], [min_obs_slider, quality_slider], [p], [k_select_1, k_select_2, k_select_3], [compound_select_1, compound_select_2, compound_select_3], [compare_1, compare_2, compare_3], [alt_k_select_1, alt_k_select_2, alt_k_select_3], [ alt_compound_select_1, alt_compound_select_2, alt_compound_select_3, ], [alt_compare_1, alt_compare_2, alt_compare_3], [filename_input_iqr, download_button_iqr], [iqr_table], ], sizing_mode="scale_width", ) )