Source code for setigen.split_utils

import sys
import os
import errno
import numpy as np
from blimpy import read_header, Waterfall


[docs]def split_fil_generator(fil_fn, f_window, f_shift=None):
    """
    Creates a generator that returns smaller Waterfall objects by 'splitting'
    an input filterbank file according to the number of frequency samples.

    Since this function only loads in data in chunks according to f_window,
    it handles very large observations well. Specifically, it will not attempt
    to load all the data into memory before splitting, which won't work when
    the data is very large anyway.

    Parameters
    ----------
    fil_fn : str
        Filterbank filename with .fil extension
    f_window : int
        Number of frequency samples per new filterbank file
    f_shift : int, optional
        Number of samples to shift when splitting filterbank. If
        None, defaults to `f_shift=f_window` so that there is no
        overlap between new filterbank files

    Returns
    -------
    split : Waterfall
        A blimpy Waterfall object containing a smaller section of the data
    """

    fch1 = read_header(fil_fn)[b'fch1']
    nchans = read_header(fil_fn)[b'nchans']
    df = read_header(fil_fn)[b'foff']

    if f_shift is None:
        f_shift = f_window

    # Note that df is negative!
    f_start = fch1 + f_window * df
    f_stop = fch1

    # Iterates down frequencies, starting from highest
    while f_start >= fch1 + nchans * df:
        split_fil = Waterfall(fil_fn, f_start=f_start, f_stop=f_stop)
        
        # Fix some header values
        split_fil.header[b'fch1'] = split_fil.file_header[b'fch1'] = f_stop
        split_fil.header[b'nchans'] = split_fil.file_header[b'nchans'] = f_window
        
        yield split_fil

        f_start += f_shift * df
        f_stop += f_shift * df


[docs]def split_fil(fil_fn, output_dir, f_window, f_shift=None):
    """
    Creates a set of new filterbank files by 'splitting' an input filterbank
    file according to the number of frequency samples.

    Parameters
    ----------
    fil_fn : str
        Filterbank filename with .fil extension
    output_dir : str
        Directory for new filterbank files
    f_window : int
        Number of frequency samples per new filterbank file
    f_shift : int, optional
        Number of samples to shift when splitting filterbank. If
        None, defaults to `f_shift=f_window` so that there is no
        overlap between new filterbank files

    Returns
    -------
    split_fns : list of str
        List of new filenames
    """
    if output_dir[-1] != '/':
        output_dir = output_dir + '/'

    try:
        os.makedirs(output_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    split_generator = split_fil_generator(fil_fn,
                                          f_window,
                                          f_shift=f_shift)

    # Iterates down frequencies, starting from highest
    split_fns = []
    for i, split_fil in enumerate(split_generator):
        output_fn = output_dir + '%s_%04d.fil' % (f_window, i)
        split_fil.write_to_fil(output_fn)
        split_fns.append(output_fn)
        print('Saved %s' % output_fn)
    return split_fns


[docs]def split_array(data, f_sample_num=None, t_sample_num=None,
                f_shift=None, t_shift=None,
                f_trim=False, t_trim=False):
    """
    Splits NumPy arrays into a list of smaller arrays according to limits in
    frequency and time. This doesn't reduce/combine data, it simply cuts the
    data into smaller chunks.

    Parameters
    ----------
    data : ndarray
        Time-frequency data

    Returns
    -------
    split_data : list of ndarray
        List of new time-frequency data frames
    """

    split_data = []

    if not isinstance(data, np.ndarray):
        sys.exit("Input data must be a NumPy array!")

    height, width = data.shape

    if f_sample_num is None:
        f_sample_num = width
    if t_sample_num is None:
        t_sample_num = height

    if f_shift is None:
        f_shift = f_sample_num
    elif f_shift <= 0:
        sys.exit("Invalid x-direction shift!")

    if t_shift is None:
        t_shift = t_sample_num
    elif t_shift <= 0:
        sys.exit("Invalid y-direction shift!")

    # Save first frame, regardless of overstepping bounds
    y_start = 0
    y_stop = min(t_sample_num, height)
    x_start = 0
    x_stop = min(f_sample_num, width)
    split_data.append(data[y_start:y_stop, x_start:x_stop])
    y_in_bound = (y_stop < height)
    x_in_bound = (x_stop < width)

    # As long as either bound is valid, continue adding frames
    while y_in_bound or x_in_bound:

        # Shift frames in the x direction
        while x_in_bound:
            x_start = x_start + f_shift
            x_stop = min(x_stop + f_shift, width)
            split_data.append(data[y_start:y_stop, x_start:x_stop])
            x_in_bound = (x_stop < width)

        # Break when both y and x are out of bounds
        if not y_in_bound:
            break

        # Shift frames in the y direction and reset x indices
        y_start = y_start + t_shift
        y_stop = min(y_stop + t_shift, height)
        x_start = 0
        x_stop = min(f_sample_num, width)
        split_data.append(data[y_start:y_stop, x_start:x_stop])
        y_in_bound = (y_stop < height)
        x_in_bound = (x_stop < width)

    # Filter out frames that aren't the same specied size
    if t_trim:
        split_data = list(filter(lambda A: A.shape[0] == t_sample_num,
                                 split_data))
    if f_trim:
        split_data = list(filter(lambda A: A.shape[1] == f_sample_num,
                                 split_data))
    return split_data