/************************************************************************
 * VSI4/VSIB Real-Time Multichannel Spectrometer
 * Copyright (C) 2009 Jan Wagner
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 **************************************************************************/

#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>

#include <time.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>
#include <signal.h>
#include <malloc.h>

#include <pthread.h>

#include <math.h> 
#include <rfftw.h>
#include <fftw.h>

#include <ipps.h>

#include "vsib_ioctl.h"

///////////////////////////////////////////////////////////////////////////////////////////////////////
// SHARED
///////////////////////////////////////////////////////////////////////////////////////////////////////

sig_atomic_t ctrl_C = 0;
unsigned char* rawbuf;
Ipp32f** integratedChs;

///////////////////////////////////////////////////////////////////////////////////////////////////////
// SIGNALS
///////////////////////////////////////////////////////////////////////////////////////////////////////

void SIGINT_handler(int signum) {
   printf("Halting...\n");
   ctrl_C = 1;
}

///////////////////////////////////////////////////////////////////////////////////////////////////////
// VSIB
///////////////////////////////////////////////////////////////////////////////////////////////////////

/**
 * Protected error-checking ioctl() for the VSIB char device
 */
void vsib_ioctl(int fd, unsigned int mode, unsigned long arg) 
{
    if (ioctl(fd, mode, arg)) {
        char err[255];
        snprintf(err, sizeof(err), "rtspectrum: ioctl(vsib_fileno=%d, mode=0x%04x,...)", fd, mode);
        perror(err);
        fprintf(stderr, "rtspectrum: standard I/O is not an VSIB board\n");
        exit(EXIT_FAILURE);
     }
}

/**
 * Initialize VSIB using specified settings
 */
void vsib_start(int fd, int mode, int giga, int embed_1pps, int skip)
{ 
   printf("VSIB: mode %d, skip %d\n", mode, skip);
   vsib_ioctl(fd, VSIB_SET_MODE,
             (VSIB_MODE_MODE(mode)
              | VSIB_MODE_RUN
              | (giga ? VSIB_MODE_GIGABIT : 0)
              | (embed_1pps ? VSIB_MODE_EMBED_1PPS_MARKERS : 0)
              | (skip & 0x0000ffff))
            );
}

/**
 * Open VSIB
 */
int vsib_open(const char* path)
{
    return open(path, O_RDONLY);
}

/**
 * Close VSIB
 */
void vsib_close(int fd)
{
    close(fd);
}

/**
 * Safe read from VSIB
 */
ssize_t vsib_read(int fd, void* buf, size_t len)
{
    ssize_t nread = read(fd, buf, len);
    if (nread < 0) {
        printf("Error reading /dev/vsib : %s\n", strerror(errno));
        return -1;
    }
    while (nread < len) {
        ssize_t nr;
        usleep(1000);
        nr = read (fd, buf+nread, len-nread);
        if (nr < 0) {
            printf("Error reading /dev/vsib : %s\n", strerror(errno));
            return -1;
        }
        nread += nr;
    } 
    return nread;
}

///////////////////////////////////////////////////////////////////////////////////////////////////////
// UNPACK-INTEGRATE AND FFT
///////////////////////////////////////////////////////////////////////////////////////////////////////

/**
 * Handle the 16-channel 2-bit format. Input data is 32 bits wide. We take only
 * one byte out of this, that is, process only 4 channels. No window-overlap
 * is used here.
 */
size_t unpack_integrate_2bit_4x16ch(unsigned char* in, size_t len, Ipp32f** out, int byte_nr)
{
    static Ipp32f precooked_LUT[256][4] __attribute__ ((aligned (8))); // 4kB
    static int precook_done = 0;
    size_t unpack_iterations;
    int ch, byte, i;

    if (!precook_done) {
        const Ipp32f  map[4] = { 1.0, 3.3359, -1.0, -3.3359 }; // {s,m} : 00,01,10,11 : direct sign/mag bits
        const Ipp32f  map_rev[4] = { 1.0, -1.0, 3.3359, -3.3359 }; // {m,s} : 00,01,10,11 : direct sign/mag bits
        for (byte=0; byte<256; byte++) {
            for (ch=0; ch<4; ch++) {
                int shift = 2*(4-ch-1);
                precooked_LUT[byte][ch] = map_rev[(byte>>shift) & 0x03];
            }
        }
        precook_done = 1;
        (void)map;
        (void)map_rev;
    }

    byte_nr %= 4;
    unpack_iterations = len / 1024;
    Ipp32f *out0 = out[0];
    Ipp32f *out1 = out[1];
    Ipp32f *out2 = out[2];
    Ipp32f *out3 = out[3];
    while (unpack_iterations--) {
        for (i=0; i<1024; i++) {
            unsigned char sample = *(in+byte_nr);
            in += 4;
            *out0 += precooked_LUT[sample][0]; out0++;
            *out1 += precooked_LUT[sample][1]; out1++;
            *out2 += precooked_LUT[sample][2]; out2++;
            *out3 += precooked_LUT[sample][3]; out3++;
        }
    }
    return len;
}

/**
 * Handle the 8-channel 2-bit format. Input data is 16 bits wide. We take only
 * one byte out of this, that is, process only 4 channels. No window-overlap
 * is used here.
 */
size_t unpack_integrate_2bit_4x8ch(unsigned char* in, size_t len, Ipp32f** out, int byte_nr)
{
    static Ipp32f precooked_LUT[256][4] __attribute__ ((aligned (8))); // 4kB
    static int precook_done = 0;
    size_t unpack_iterations;
    int ch, byte, i;

    if (!precook_done) {
        const Ipp32f  map[4] = { 1.0, 3.3359, -1.0, -3.3359 }; // {s,m} : 00,01,10,11 : direct sign/mag bits
        const Ipp32f  map_rev[4] = { 1.0, -1.0, 3.3359, -3.3359 }; // {m,s} : 00,01,10,11 : direct sign/mag bits
        for (byte=0; byte<256; byte++) {
            for (ch=0; ch<4; ch++) {
                int shift = 2*(4-ch-1);
                precooked_LUT[byte][ch] = map_rev[(byte>>shift) & 0x03];
            }
        }
        precook_done = 1;
        (void)map;
        (void)map_rev;
    }

    byte_nr %= 2;
    unpack_iterations = len / 1024;
    Ipp32f *out0 = out[0];
    Ipp32f *out1 = out[1];
    Ipp32f *out2 = out[2];
    Ipp32f *out3 = out[3];
    while (unpack_iterations--) {
        for (i=0; i<1024; i++) {
            unsigned char sample = *(in+byte_nr);
            in += 2;
            *out0 += precooked_LUT[sample][0]; out0++;
            *out1 += precooked_LUT[sample][1]; out1++;
            *out2 += precooked_LUT[sample][2]; out2++;
            *out3 += precooked_LUT[sample][3]; out3++;
        }
    }
    return len;
}

/**
 * Handle the 8-channel 2-bit format. Input data is 16 bits wide.
 * All 8 channels are processed. No window-overlap is used here.
 */
size_t unpack_integrate_2bit_8x8ch(unsigned char* in, size_t len, Ipp32f** out)
{
    static Ipp32f precooked_LUT[256][4] __attribute__ ((aligned (8))); // 4kB
    static int precook_done = 0;
    size_t unpack_iterations;
    int ch, byte, i;

    if (!precook_done) {
        const Ipp32f  map[4] = { 1.0, 3.3359, -1.0, -3.3359 }; // {s,m} : 00,01,10,11 : direct sign/mag bits
        const Ipp32f  map_rev[4] = { 1.0, -1.0, 3.3359, -3.3359 }; // {m,s} : 00,01,10,11 : direct sign/mag bits
        for (byte=0; byte<256; byte++) {
            for (ch=0; ch<4; ch++) {
                int shift = 2*(4-ch-1);
                precooked_LUT[byte][ch] = map_rev[(byte>>shift) & 0x03];
            }
        }
        precook_done = 1;
        (void)map;
        (void)map_rev;
    }

    unpack_iterations = len / 1024;
    Ipp32f *out0 = out[0];
    Ipp32f *out1 = out[1];
    Ipp32f *out2 = out[2];
    Ipp32f *out3 = out[3];
    Ipp32f *out4 = out[3];
    Ipp32f *out5 = out[3];
    Ipp32f *out6 = out[3];
    Ipp32f *out7 = out[3];
    while (unpack_iterations--) {
        for (i=0; i<1024; i++) {
            unsigned char sample = *(in+0);
            *out0 += precooked_LUT[sample][0]; out0++;
            *out1 += precooked_LUT[sample][1]; out1++;
            *out2 += precooked_LUT[sample][2]; out2++;
            *out3 += precooked_LUT[sample][3]; out3++;
            sample = *(in+1);
            *out4 += precooked_LUT[sample][0]; out4++;
            *out5 += precooked_LUT[sample][1]; out5++;
            *out6 += precooked_LUT[sample][2]; out6++;
            *out7 += precooked_LUT[sample][3]; out7++;
            in += 2;
        }
    }
    return len;
}

/**
 * Handle results from the integrated 4-channel results.
 * The four channels are FFT transformed.
 */
size_t fft_channels(Ipp32f** inout, size_t fftlen, int numprocessedchannels)
{
   static int initialized = 0;
   static IppsFFTSpec_R_32f* fftspec;
   static Ipp8u* fftbuf;
   int i;

   if (!initialized) {
       int size;
       IppStatus s;
       s = ippsFFTInitAlloc_R_32f(&fftspec, log2((double)fftlen), IPP_FFT_DIV_FWD_BY_N, ippAlgHintAccurate);
       s = ippsFFTGetBufSize_R_32f(fftspec, &size);
       fftbuf = memalign(128, size);
       initialized = 1;
   }

   for (i=0; i<numprocessedchannels; i++) {
       ippsFFTFwd_RToPerm_32f_I(inout[i], fftspec, fftbuf);
       // 2) extract phase cal signal (might be tricky as FFT 2^N are not even at Hertz)
   }
   return 0;
}

/**
 * Handle the 4-channel 2-bit format. Input data is 8 bits wide.
 * No window-overlap is used here.
 */
size_t unpack_integrate_2bit_4x4ch(unsigned char* in, size_t len, Ipp32f** out)
{
    static Ipp32f precooked_LUT[256][4]; // 4kB
    static int precook_done = 0;
    size_t unpack_iterations;
    int ch, byte, i;

    if (!precook_done) {
        const Ipp32f  map[4] = { 1.0, 3.3359, -1.0, -3.3359 }; // {s,m} : 00,01,10,11 : direct sign/mag bits
        const Ipp32f  map_rev[4] = { 1.0, -1.0, 3.3359, -3.3359 }; // {m,s} : 00,01,10,11 : direct sign/mag bits
        for (byte=0; byte<256; byte++) {
            for (ch=0; ch<4; ch++) {
                int shift = 2*(4-ch-1);
                precooked_LUT[byte][ch] = map_rev[(byte>>shift) & 0x03];
            }
        }
        precook_done = 1;
        (void)map;
        (void)map_rev;
    }

    unpack_iterations = len / 1024;
    Ipp32f *out0 = out[0];
    Ipp32f *out1 = out[1];
    Ipp32f *out2 = out[2];
    Ipp32f *out3 = out[3];
    while (unpack_iterations--) {
        for (i=0; i<1024; i++) {
            unsigned char sample = *in;
            in++;
            *out0 += precooked_LUT[sample][0]; out0++;
            *out1 += precooked_LUT[sample][1]; out1++;
            *out2 += precooked_LUT[sample][2]; out2++;
            *out3 += precooked_LUT[sample][3]; out3++;
        }
    }
    return len;
}

///////////////////////////////////////////////////////////////////////////////////////////////////////
// MAIN
///////////////////////////////////////////////////////////////////////////////////////////////////////

int main(int argc, char **argv)
{
    size_t fftlen = 0;
    float samplerate = 0.0f;
    float inttime = 0.0f;
    size_t fftstacklen = 0;
    int vsib_fd = 0;
    struct timeval tv_start, tv_stop;
    struct sigaction sa;
    double usec;
    int numchannels;
    int numprocessedchannels;
    int outfiles[16];
    int i;

    if (argc != 5) {
        printf("\nUsage: vsi4spec <fftlen> <samplerate> <inttime> <4|8|16>\n"
               "  where fftlen is the length of the FFT, at least 1024 points\n"
               "        samplerate is the sampling rate of each channel in Hertz\n"
               "        inttime is the integration time in seconds\n"
               "        4|8|16 is the number of channels to capture\n\n");
        return -1;
    }

    /* Args */
    fftlen = atoi(argv[1]);
    if ((fftlen%1024) != 0) { fftlen = 1024 * (size_t)(fftlen/1024); }
	samplerate = atof(argv[2]);
    inttime = atof(argv[3]);
    numchannels = atoi(argv[4]);
    if (numchannels!=4 && numchannels!=8 && numchannels!=16) { numchannels=4; }

    /* Derive vars */
    fftlen = pow(2.0, (int)log2(fftlen));
    fftstacklen = (size_t)(samplerate * inttime / fftlen);
    inttime = ((double)fftstacklen)*((double)fftlen)/samplerate;
    printf("Integrating %ld of %ld-point FFTs which corresponds to %f seconds\n",
           (long)fftstacklen, (long)fftlen, inttime);

    /* Allocate */
    rawbuf = memalign(4096, fftlen*(numchannels/4));
    integratedChs = (Ipp32f**)malloc(sizeof(Ipp32f*) * numchannels);
    for (i=0; i<numchannels; i++) {
        integratedChs[i] = (Ipp32f*)memalign(16, sizeof(Ipp32f) * fftlen);
        ippsZero_32f(integratedChs[i], fftlen);
    }

    /* Revector Control-C (SIGINT) handling */
    sigemptyset (&sa.sa_mask);
    sa.sa_flags = 0;
    sa.sa_handler = SIGINT_handler;
    sigaction (SIGINT, &sa, 0);

    /* Start VSIB capture */
    vsib_fd = vsib_open("/dev/vsib");
    if (vsib_fd < 0) {
        printf("Error opening /dev/vsib : %s\n", strerror(errno));
        return -1;
    }
    switch (numchannels) {
        // mode,giga,1pps,skip
        case 4:
           vsib_start(vsib_fd, /*low8bits*/0x03, 0, 0, 0); 
           numprocessedchannels = numchannels;
           break;
        case 8:
           vsib_start(vsib_fd, /*low16bits*/0x02, 0, 0, 0); 
           numprocessedchannels = 8; //NOTE choose 4 if the cpu overloads
           break;
        case 16:
           vsib_start(vsib_fd, /*all32bits*/0x00, 0, 0, 1); // 1024Mb/s PCI prob!
           numprocessedchannels = 4;
           break;
        default:
           numprocessedchannels = 4;
    }

    /* Open files */
    for (i=0; i<numprocessedchannels; i++) {
        char filename[255];
        snprintf(filename, sizeof(filename)-1, "vsi4spec_%dk_ch%d.bin", fftlen/1024, i+1);
        outfiles[i] = creat(filename, S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH);
    }

    /* Process indefinitely */
    while (!ctrl_C) {

        /* Integrate */
        gettimeofday(&tv_start, NULL);
        size_t blocksleft = fftstacklen;
        while (blocksleft-- > 0) {
            ssize_t nr = vsib_read(vsib_fd, rawbuf, fftlen*(numchannels/4));
            if (nr < 0) {
                printf("Error reading /dev/vsib : %s\n", strerror(errno));
                return -1;
            }
            switch (numchannels) {
               case 4:
                   unpack_integrate_2bit_4x4ch(rawbuf, fftlen, integratedChs);
                   break;
               case 8:
                   if (numprocessedchannels == 4) {
                       unpack_integrate_2bit_4x8ch(rawbuf, fftlen, integratedChs, /*bytenr 0..1*/ 0);
                   } else {
                       unpack_integrate_2bit_8x8ch(rawbuf, fftlen, integratedChs);
                   }
                   break;
               case 16:
                   unpack_integrate_2bit_4x16ch(rawbuf, fftlen, integratedChs, /*bytenr 0..1*/ 0);
                   break;
               default:
                   break;
            }
        }

        /* FFT */
        fft_channels(integratedChs, fftlen, numprocessedchannels);

        /* Write to file or network */
        for (i=0; i<numprocessedchannels; i++) {
            (void)write(outfiles[i], integratedChs[i], fftlen*sizeof(float));
        }

        /* Stats */
        gettimeofday(&tv_stop, NULL);
        usec = 1e6*(tv_stop.tv_sec - tv_start.tv_sec) + tv_stop.tv_usec - tv_start.tv_usec;
        printf("Exec time: %f seconds (%f Ms/s/ch * %d channels)\n", 
               1e-6*usec, ((double)fftstacklen*fftlen)/usec, numprocessedchannels);
    }
  
    /* Close all */
    vsib_close(vsib_fd);
    for (i=0; i<numprocessedchannels; i++) {
        close(outfiles[i]);
    }
    return 0;
}

