Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

closes #255 matplotlib histogram of nearest neighbor energies #256

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 94 additions & 7 deletions nuad/np.py
Original file line number Diff line number Diff line change
Expand Up @@ -522,7 +522,7 @@ def longest_common_substrings_singlea1(a1: np.ndarray, a2s: np.ndarray) \
idx_longest_raveled = np.argmax(counter_flat, axis=1)
len_longest = counter_flat[np.arange(counter_flat.shape[0]), idx_longest_raveled]

idx_longest = np.unravel_index(idx_longest_raveled, dims=(len_a1 + 1, len_a2 + 1))
idx_longest = np.unravel_index(idx_longest_raveled, shape=(len_a1 + 1, len_a2 + 1))
a1idx_longest = idx_longest[0] - len_longest
a2idx_longest = idx_longest[1] - len_longest

Expand Down Expand Up @@ -583,7 +583,7 @@ def _longest_common_substrings_pairs(a1s: np.ndarray, a2s: np.ndarray) \
idx_longest_raveled = np.argmax(counter_flat, axis=1)
len_longest = counter_flat[np.arange(counter_flat.shape[0]), idx_longest_raveled]

idx_longest = np.unravel_index(idx_longest_raveled, dims=(len_a1 + 1, len_a2 + 1))
idx_longest = np.unravel_index(idx_longest_raveled, shape=(len_a1 + 1, len_a2 + 1))
a1idx_longest = idx_longest[0] - len_longest
a2idx_longest = idx_longest[1] - len_longest

Expand Down Expand Up @@ -669,7 +669,7 @@ def _strongest_common_substrings_all_pairs(a1s: np.ndarray, a2s: np.ndarray, tem
len_strongest = counter_flat[np.arange(counter_flat.shape[0]), idx_strongest_raveled]
energy_strongest = energies_flat[np.arange(counter_flat.shape[0]), idx_strongest_raveled]

idx_strongest = np.unravel_index(idx_strongest_raveled, dims=(len_a1 + 1, len_a2 + 1))
idx_strongest = np.unravel_index(idx_strongest_raveled, shape=(len_a1 + 1, len_a2 + 1))
a1idx_strongest = idx_strongest[0] - len_strongest
a2idx_strongest = idx_strongest[1] - len_strongest

Expand Down Expand Up @@ -902,10 +902,6 @@ def write_to_file(self, filename: str) -> None:
for i in range(self.numseqs):
f.write(self.get_seq_str(i) + '\n')

def wcenergy(self, idx: int, temperature: float) -> float:
"""Return energy of idx'th sequence binding to its complement."""
return wcenergy(self.seqarr[idx], temperature)

def __repr__(self) -> str:
return 'DNASeqSet(seqs={})'.format(str([self[i] for i in range(self.numseqs)]))

Expand Down Expand Up @@ -1283,3 +1279,94 @@ def calculate_wc_energies(seqarr: np.ndarray, temperature: float, negate: bool =
def wc_arr(seqarr: np.ndarray) -> np.ndarray:
"""Return numpy array of reverse complements of sequences in `seqarr`."""
return (3 - seqarr)[:, ::-1]


def energy_hist(length: int | Iterable[int], temperature: float = 37,
combine_lengths: bool = False, show: bool = False,
num_random_sequences: int = 100_000,
figsize: Tuple[int, int] = (15, 6), **kwargs) -> None:
"""
Make a matplotlib histogram of the nearest-neighbor energies (as defined by
:meth:`DNASeqList.energies`) of all DNA sequences of the given length(s),
or a randomly selected subset if length(s) is too large to enumerate all DNA sequences
of that length.

This is useful, for example, to choose low and high energy values to pass to
:any:`NearestNeighborEnergyFilter`.

:param length:
length of DNA sequences to consider, or an iterable (e.g., list) of lengths
:param temperature:
temperature in Celsius
:param combine_lengths:
If True, then `length` should be an iterable, and the histogram will combine all calculated energies
from all lengths into one histogram to plot. If False (the default), then different lengths are
plotted in different colors in the histogram.
:param show:
If False, then the histogram plotted, but matplotlib.pyplot.show() is not called.
This allows one to tweak aspects of the histogram after it is plotted calling functions in
matplotlib.pyplot.
If True, then the histogram is displayed using matplotlib.pyplot.show().
This is convenient for making several histogram plots in a single notebook cell.
(Otherwise the several calls to matplotlib.pyplot.hist will overwrite each other.)
:param num_random_sequences:
If the length is too large to enumerate all DNA sequences of that length,
then this many random sequences are used to estimate the histogram.
:param figsize:
Size of the figure in inches.
:param kwargs:
Any keyword arguments given are passed along as keyword arguments to matplotlib.pyplot.hist:
https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.hist.html
"""
import matplotlib.pyplot as plt

if combine_lengths and isinstance(length, int):
raise ValueError(f'length must be an iterable if combine_lengths is False, '
f'but it is the int {length}')

plt.figure(figsize=figsize)
plt.xlabel(f'Nearest-neighbor energy (kcal/mol)')

lengths = [length] if isinstance(length, int) else length

alpha = 1
if len(lengths) > 1:
alpha = 0.5
if 'label' in kwargs:
raise ValueError(f'label (={kwargs["label"]}) '
f'should not be specified if multiple lengths are given')

bins = kwargs.pop('bins', 20)

all_energies = []
for length in lengths:
if length < 9:
seqs = DNASeqList(length=length)
else:
seqs = DNASeqList(length=length, num_random_seqs=num_random_sequences)
energies = seqs.energies(temperature=temperature)

if combine_lengths:
all_energies.extend(energies)
else:
label = kwargs['label'] if 'label' in kwargs else f'length {length}'
_ = plt.hist(energies, alpha=alpha, label=label, bins=bins, **kwargs)

if combine_lengths:
if 'label' in kwargs:
label = kwargs['label']
del kwargs['label']
else:
if len(lengths) == 1:
label = f'length {length}'
else:
lengths_delimited = ', '.join(map(str, lengths))
label = f'lengths {lengths_delimited} combined'
_ = plt.hist(all_energies, alpha=alpha, label=label, bins=bins, **kwargs)

plt.legend(loc='upper right')
title = kwargs.pop('title', f'Nearest-neighbor energies of DNA sequences at {temperature} C')
plt.title(title)

if show:
plt.show()
Loading