/* ratectl.c, bitrate control routines (linear quantization only currently) */

/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */

/*
 * Disclaimer of Warranty
 *
 * These software programs are available to the user without any license fee or
 * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
 * any and all warranties, whether express, implied, or statuary, including any
 * implied warranties or merchantability or of fitness for a particular
 * purpose.  In no event shall the copyright-holder be liable for any
 * incidental, punitive, or consequential damages of any kind whatsoever
 * arising from the use of these programs.
 *
 * This disclaimer of warranty extends to the user of these programs and user's
 * customers, employees, agents, transferees, successors, and assigns.
 *
 * The MPEG Software Simulation Group does not represent or warrant that the
 * programs furnished hereunder are free of infringement of any third-party
 * patents.
 *
 * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
 * are subject to royalty fees to patent holders.  Many of these patents are
 * general enough such that they are unavoidable regardless of implementation
 * design.
 *
 */

#include <stdio.h>
#include <math.h>
#include <pthread.h>

#include "config.h"
#include "global.h"

/* rate control variables */
/*
 * static double R, T, d;
 * static double actsum;
 * static int Np, Nb;
 * static double S, Q;
 * static int prev_mquant;
 * static double bitcnt_EOP;
 * static double next_ip_delay; // due to frame reordering delay
 * static double decoding_time;
 * static int Xi, Xp, Xb, r, d0i, d0p, d0b;
 * static double avg_act;
 */

/*
 * ratectl_init_seq
 *
 * Sequence-level set-up of the rate controller.
 *
 * Creates ratectl->ratectl_lock (the mutex ratectl_calc_mquant() holds while
 * it updates the shared state; stop_ratectl() destroys it) and loads the
 * start-up values of the model:
 *
 *   r            reaction parameter.  A non-zero value already present in
 *                the structure is kept, otherwise 2 * bit_rate / frame_rate.
 *   avg_act      average activity.  A non-zero value already present is
 *                kept, otherwise 400.
 *   R            remaining number of bits in the GOP, reset to 0.
 *   Xi, Xp, Xb   global complexity measures for I, P and B pictures, always
 *                reset to 160, 60 and 42 times bit_rate / 115.
 *   d0i/d0p/d0b  virtual buffer fullness for I, P and B pictures, always
 *                reset to 10 * r / 31 (times 1.4 for B pictures).
 *
 * Every computed default is rounded to the nearest integer.
 *
 * NOTE(review): r and avg_act are read before they are written, so the
 * caller must hand in a structure whose r and avg_act are either zero or
 * deliberate presets -- confirm against the allocation site.
 */
void ratectl_init_seq(ratectl_t *ratectl)
{
	/*
	 * Only default mutex attributes are wanted, so pass NULL instead of
	 * initialising a pthread_mutexattr_t that was never destroyed.
	 */
	pthread_mutex_init(&(ratectl->ratectl_lock), NULL);

	/* reaction parameter (constant) */
	if (ratectl->r == 0)
		ratectl->r = (int)floor(2.0 * bit_rate / frame_rate + 0.5);

	/* average activity */
	if (ratectl->avg_act == 0.0)
		ratectl->avg_act = 400.0;

	/* remaining # of bits in GOP */
	ratectl->R = 0;

	/* global complexity measures: any previous value is discarded */
	ratectl->Xi = (int)floor(160.0 * bit_rate / 115.0 + 0.5);
	ratectl->Xp = (int)floor( 60.0 * bit_rate / 115.0 + 0.5);
	ratectl->Xb = (int)floor( 42.0 * bit_rate / 115.0 + 0.5);

	/* virtual buffer fullness: any previous value is discarded */
	ratectl->d0i = (int)floor(10.0 * ratectl->r / 31.0 + 0.5);
	ratectl->d0p = (int)floor(10.0 * ratectl->r / 31.0 + 0.5);
	ratectl->d0b = (int)floor(1.4 * 10.0 * ratectl->r / 31.0 + 0.5);
}

/*
 * ratectl_init_GOP
 *
 * Start-of-GOP bookkeeping.
 *
 *   np  number of P pictures in the new group of pictures
 *   nb  number of B pictures in the new group of pictures
 *
 * The bit budget of the new GOP, (1 + np + nb) * bit_rate / frame_rate
 * rounded to the nearest integer, is added on top of whatever is still left
 * in R.  Np and Nb are then loaded with the number of P and B pictures still
 * to be coded; when fieldpic is set the counts become 2 * np + 1 and 2 * nb.
 */
void ratectl_init_GOP(ratectl_t *ratectl, int np, int nb)
{
	const int gop_pictures = 1 + np + nb;

	ratectl->R += floor(gop_pictures * bit_rate / frame_rate + 0.5);

	if (fieldpic)
	{
		ratectl->Np = 2 * np + 1;
		ratectl->Nb = 2 * nb;
	}
	else
	{
		ratectl->Np = np;
		ratectl->Nb = nb;
	}
}

#ifdef HAVE_MMX

/*
 * mmx_start_var_sblk
 *
 * Zeroes the MMX registers mm4..mm7 (each is XORed with itself).  Going by
 * the labels inside the asm text, mm7/mm6 are meant to hold the running sum
 * "s" and mm5/mm4 the running sum of squares "s2" that mmx_var_sblk()
 * accumulates into.
 *
 * NOTE(review): no caller of the mmx_* helpers is visible in this file, and
 * the asm statement declares neither outputs nor clobbers, so the compiler
 * is not told that the MMX registers change -- confirm before relying on it.
 */
inline void mmx_start_var_sblk()
{
	asm("
		.align 8
			pxor %%mm7, %%mm7;     // s
			pxor %%mm6, %%mm6;     // s
			pxor %%mm5, %%mm5;     // s2
			pxor %%mm4, %%mm4;     // s2
		" 
		: 
		: );
}

/*
 * mmx_var_sblk
 *
 * Loads the 8 bytes at p into mm0 (p is passed in %ebx through the "b"
 * constraint) and adds values derived from them to the accumulators
 * mm7/mm6 (labelled "s") and mm5/mm4 (labelled "s2").
 *
 * NOTE(review): mm1 is the destination of the punpcklbw but is not
 * initialised by any code visible here, so its previous contents are
 * interleaved with the low bytes of mm0.  Likewise paddw/pmullw operate on
 * mm0 as four 16-bit words while it still holds eight packed bytes.
 * Confirm that this routine computes what its comments say before using it.
 *
 * NOTE(review): the asm has no clobber list, and %ebx addressing ties it to
 * 32-bit x86.
 */
inline void mmx_var_sblk(unsigned char *p)
{
	asm("
		.align 8
		movq         (%%ebx),       %%mm0;      // Load entire row
		punpcklbw    %%mm0,         %%mm1;      // Unpack to higher precision for addition
		paddw        %%mm0,         %%mm7;      // Accumulate into s
		pmullw       %%mm0,         %%mm0;      // Get square for s2
		paddw        %%mm1,         %%mm6;      // Accumulate into s
		pmullw       %%mm1,         %%mm1;      // Get square for s2
		paddw        %%mm0,         %%mm5;      // Accumulate into s2
		paddw        %%mm1,         %%mm4;      // Accumulate into s2
		" 
		: 
		: "b" (p));
}

/*
 * Mask referenced by name from the pand in mmx_end_var_sblk(): first
 * element all ones, second element zero.
 * NOTE(review): this only forms a 64-bit "keep the low doubleword" mask if
 * unsigned long is 32 bits wide -- confirm on the target.
 */
static unsigned long  MMX_ACCUM_AND2[]         = {0xffffffff, 0x00000000};

/*
 * mmx_end_var_sblk
 *
 * Reduces the accumulators left behind by mmx_var_sblk(): widens mm7, adds
 * mm6 to it, folds the upper doubleword of the result onto the lower one,
 * stores the 64-bit register into the local r (its address is passed in
 * %ebx), executes emms to leave MMX state, and returns the low 32 bits of r.
 *
 * NOTE(review): r is written only through the pointer handed to the asm,
 * but the statement declares no output operand and no "memory" clobber, so
 * the compiler may assume r is still 0 -- confirm.
 *
 * NOTE(review): mm2 receives the unpacked low half of mm7 but is never read
 * afterwards, and mm5/mm4 (the "s2" accumulators) are not read at all here.
 */
inline unsigned int mmx_end_var_sblk()
{
	unsigned long long r = 0;
	asm("
		.align 8
		pxor            %%mm3,  %%mm3;         // Clear temp for unpacking
		movq            %%mm7,  %%mm2;         // Make a copy for unpacking
		punpcklwd       %%mm3,  %%mm2;         // Unpack lower 2 pixels for accumulation
		punpckhwd       %%mm3,  %%mm7;         // Unpack high 2 pixels for accumulation
 		paddw           %%mm6,  %%mm7;         // Add 2 doublewords in each register
 		movq            %%mm7,  %%mm6;         // Copy the result for a final add
 		pand            MMX_ACCUM_AND2, %%mm7;  // And the result for accumulation
 		psrlq           $32,    %%mm6;         // Shift the copy right for accumulation
 		paddd           %%mm6,  %%mm7;         // Add the results
 		movq            %%mm7,  (%%ebx);       // Store result
		emms;
		"
		: :  "b" (&r));

	return (unsigned int)r;
}

#endif

/*
 * var_sblk
 *
 * Variance of one 8x8 block of 8-bit samples.
 *
 *   p   address of the block's top-left sample (only read)
 *   lx  distance between the starts of two consecutive rows of the block
 *
 * Returns mean(v*v) - mean(v)^2 over the 64 samples.  The two integer sums
 * cannot overflow: they are bounded by 64*255 and 64*255*255.
 */
static double var_sblk(const unsigned char *p, int lx)
{
	unsigned int sum = 0;
	unsigned int sum_sq = 0;
	double mean;
	int row, col;

	for (row = 0; row < 8; row++)
	{
		/* index from the block origin so p is never stepped past the last row */
		const unsigned char *line = p + row * lx;

		for (col = 0; col < 8; col++)
		{
			const unsigned int v = line[col];

			sum += v;
			sum_sq += v * v;
		}
	}

	mean = (double)sum / 64.0;

	return (double)sum_sq / 64.0 - mean * mean;
}


/*
 * calc_actj
 *
 * Fills mbinfo[].act with the spatial activity of every macroblock of the
 * picture in `frame`, walking the picture in 16x16 steps, row by row.
 *
 * The activity of a macroblock is 1 plus the smallest variance found among
 * its four 8x8 luminance blocks read with line stride width2.  When neither
 * fieldpic nor prog_seq is set, the four 8x8 blocks read with stride
 * 2 * width (every second line, starting on line 0 and on line 1) take part
 * in the minimum as well.
 */
static void calc_actj(unsigned char *frame)
{
	int i, j;
	int k = 0;

	for (j = 0; j < height2; j += 16)
	{
		for (i = 0; i < width; i += 16)
		{
			/* top-left sample of the macroblock; a bottom-field picture
			   starts `width` bytes into the buffer */
			unsigned char *p = frame + ((pict_struct == BOTTOM_FIELD) ? width : 0) + i + width2 * j;
			double actj, var;

			/* take minimum spatial activity measure of luminance blocks */
			actj = var_sblk(p, width2);
			var = var_sblk(p + 8, width2);
			if (var < actj) actj = var;
			var = var_sblk(p + 8 * width2, width2);
			if (var < actj) actj = var;
			var = var_sblk(p + 8 * width2 + 8, width2);
			if (var < actj) actj = var;

			if (!fieldpic && !prog_seq)
			{
				/* field */
				var = var_sblk(p, width << 1);
				if (var < actj) actj = var;
				var = var_sblk(p + 8, width << 1);
				if (var < actj) actj = var;
				var = var_sblk(p + width, width << 1);
				if (var < actj) actj = var;
				var = var_sblk(p + width + 8, width << 1);
				if (var < actj) actj = var;
			}

			mbinfo[k++].act = actj + 1.0;
		}
	}
}

/* Note: we need to substitute K for the 1.4 and 1.0 constants -- this can
   be modified to fit image content */

/*
 * ratectl_init_pict
 *
 * Step 1: compute target bits for current picture being coded.
 *
 *   frame  picture buffer handed on to calc_actj()
 *
 * Depending on pict_type the target T is derived from the remaining GOP
 * budget R, the remaining picture counts Np/Nb and the complexity measures
 * Xi/Xp/Xb, and the working buffer fullness d is loaded from d0i, d0p or
 * d0b.  For any other pict_type, T and d are left as they were.  T is then
 * raised to at least bit_rate / (8 * frame_rate), rounded.
 *
 * The function also records the current bitcount() in S, clears the mquant
 * accumulator Q and the activity accumulator actsum, and has calc_actj()
 * compute the activity of every macroblock of `frame`.
 */
void ratectl_init_pict(ratectl_t *ratectl, unsigned char *frame)
{
	double lowest_target;

	if (pict_type == I_TYPE)
	{
		ratectl->d = ratectl->d0i;
		ratectl->T = floor(ratectl->R / (1.0 + ratectl->Np * ratectl->Xp / (ratectl->Xi * 1.0) + ratectl->Nb * ratectl->Xb / (ratectl->Xi * 1.4)) + 0.5);
	}
	else if (pict_type == P_TYPE)
	{
		ratectl->d = ratectl->d0p;
		ratectl->T = floor(ratectl->R / (ratectl->Np + ratectl->Nb * 1.0 * ratectl->Xb / (1.4 * ratectl->Xp)) + 0.5);
	}
	else if (pict_type == B_TYPE)
	{
		ratectl->d = ratectl->d0b;
		ratectl->T = floor(ratectl->R / (ratectl->Nb + ratectl->Np * 1.4 * ratectl->Xp / (1.0 * ratectl->Xb)) + 0.5);
	}

	/* never aim below one eighth of the average picture size */
	lowest_target = floor(bit_rate / (8.0 * frame_rate) + 0.5);
	if (lowest_target > ratectl->T)
		ratectl->T = lowest_target;

	/* bit position at the start of the picture; accumulators start from zero */
	ratectl->S = bitcount();
	ratectl->Q = 0;
	ratectl->actsum = 0.0;

	calc_actj(frame);
}

/*
 * ratectl_start_mb
 *
 * Compute the initial quantization stepsize (at the beginning of a picture).
 *
 * The starting point is Qj = d * 31 / r, or fixed_mquant when that is
 * non-zero.
 *
 *   q_scale_type set:    2 * Qj is rounded, clipped to 1..112 and mapped
 *                        through map_non_linear_mquant[] and
 *                        non_linear_mquant_table[].
 *   q_scale_type clear:  Qj is rounded and doubled, giving an even value
 *                        clipped to 2..62, which is also stored in
 *                        prev_mquant.
 *
 * Returns the resulting mquant.
 *
 * The value is clipped while it is still a double.  d can be negative, and
 * left-shifting a negative int (as `mquant <<= 1` used to do) is undefined
 * behaviour, as is converting an out-of-range or NaN double to int.  The
 * `!(x >= lo)` form sends NaN to the lower bound.
 */
int ratectl_start_mb(ratectl_t *ratectl)
{
	int mquant;
	double Qj, level;

	Qj = ratectl->d * 31.0 / ratectl->r;
	if(fixed_mquant) Qj = fixed_mquant;

	if(q_scale_type)
	{
		level = floor(2.0 * Qj + 0.5);

		/* clip to the legal index range of the non-linear mapping */
		if(!(level >= 1.0))
			level = 1.0;
		if(level > 112.0)
			level = 112.0;

		/* map to legal quantization level */
		mquant = non_linear_mquant_table[map_non_linear_mquant[(int)level]];
	}
	else
	{
		level = floor(Qj + 0.5);

		/* 1..31 before doubling is the legal (linear) range 2..62 after it */
		if(!(level >= 1.0))
			level = 1.0;
		if(level > 31.0)
			level = 31.0;

		mquant = 2 * (int)level;

		ratectl->prev_mquant = mquant;
	}

	return mquant;
}

/*
 * ratectl_update_pict
 *
 * End-of-picture update of the rate control model.  Does nothing when
 * fixed_mquant is non-zero.
 *
 * Otherwise:
 *   S        becomes the number of bits spent on the picture (current
 *            bitcount() minus the value ratectl_init_pict() stored in S)
 *   R        is reduced by S (remaining # of bits in GOP)
 *   d        grows by S - T, the overshoot relative to the target
 *   avg_act  becomes actsum / (mb_width * mb_height2)
 *   X        = S * 0.5 * Q / (mb_width * mb_height2), rounded, is stored as
 *            the complexity measure of the current picture type (Xi/Xp/Xb)
 *   d        is saved as d0i/d0p/d0b for the next picture of that type, and
 *            Np or Nb is decremented after a P or B picture
 */
void ratectl_update_pict(ratectl_t *ratectl)
{
	double X;

	if(fixed_mquant) return;

	/* total # of bits in picture */
	ratectl->S = bitcount() - ratectl->S;

	/* remaining # of bits in GOP */
	ratectl->R -= ratectl->S;

	X = floor(ratectl->S * ((0.5 * (double)ratectl->Q) / (mb_width * mb_height2)) + 0.5);
	ratectl->d += ratectl->S - ratectl->T;
	ratectl->avg_act = ratectl->actsum / (mb_width * mb_height2);

	switch (pict_type)
	{
		case I_TYPE:
			ratectl->Xi = X;
			ratectl->d0i = ratectl->d;
			break;
		case P_TYPE:
			ratectl->Xp = X;
			ratectl->d0p = ratectl->d;
			ratectl->Np--;
			break;
		case B_TYPE:
			ratectl->Xb = X;
			ratectl->d0b = ratectl->d;
			ratectl->Nb--;
			break;
		default:
			/* other picture types leave the per-type state untouched */
			break;
	}
}

/*
 * ratectl_calc_mquant
 *
 * Step 2: measure virtual buffer - estimated buffer discrepancy - and turn
 * it into the quantiser scale for macroblock j.
 *
 *   j  index of the macroblock in mbinfo[]; also its position in the
 *      uniform bit-distribution model
 *
 * Returns the mquant chosen for the macroblock.
 *
 * The whole computation runs with ratectl->ratectl_lock held because it
 * updates actsum, prev_mquant and Q in the shared structure.
 *
 * The value is clipped while it is still a double.  dj is negative whenever
 * fewer bits than modelled have been spent, and left-shifting a negative
 * int (as `mquant <<= 1` used to do) is undefined behaviour, as is
 * converting an out-of-range or NaN double to int.  The `!(x >= lo)` form
 * sends NaN to the lower bound.
 */
int ratectl_calc_mquant(ratectl_t *ratectl, int j)
{
	int mquant;
	double dj, Qj, actj, N_actj, level;

	pthread_mutex_lock(&(ratectl->ratectl_lock));

	actj = mbinfo[j].act;
	ratectl->actsum += actj;

	/* measure virtual buffer discrepancy from uniform distribution model */
	dj = ratectl->d + (bitcount() - ratectl->S) - j * (ratectl->T / (mb_width * mb_height2));

	/* scale against dynamic range of mquant and the bits/picture count */
	Qj = dj * 31.0 / ratectl->r;

	if(fixed_mquant) Qj = fixed_mquant;

	/* compute normalized activity */
	N_actj = (2.0 * actj + ratectl->avg_act) / (actj + 2.0 * ratectl->avg_act);

	if(q_scale_type)
	{
		/* modulate mquant with combined buffer and local activity measures */
		level = floor(2.0 * Qj * N_actj + 0.5);

		/* clip to the legal index range of the non-linear mapping */
		if(!(level >= 1.0))
			level = 1.0;
		if(level > 112.0)
			level = 112.0;

		/* map to legal quantization level */
		mquant = non_linear_mquant_table[map_non_linear_mquant[(int)level]];
	}
	else
	{
		/* modulate mquant with combined buffer and local activity measures */
		level = floor(Qj * N_actj + 0.5);

		/* 1..31 before doubling is the legal (linear) range 2..62 after it */
		if(!(level >= 1.0))
			level = 1.0;
		if(level > 31.0)
			level = 31.0;

		mquant = 2 * (int)level;

		/* ignore small changes in mquant */
		if(mquant >= 8 && (mquant - ratectl->prev_mquant) >= -4 && (mquant - ratectl->prev_mquant) <= 4)
			mquant = ratectl->prev_mquant;

		ratectl->prev_mquant = mquant;
	}

	ratectl->Q += mquant; /* for calculation of average mquant */
	pthread_mutex_unlock(&(ratectl->ratectl_lock));

	return mquant;
}

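/* VBV calculations
 *
 * These routines are meant to generate warnings if underflow or overflow
 * occurs; in this file both entry points below have empty bodies, so no
 * VBV checking is actually performed.
 */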

/* vbv_end_of_picture
 *
 * - has to be called directly after writing picture_data()
 * - needed for accurate VBV buffer overflow calculation
 * - assumes there is no byte stuffing prior to the next start code
 *
 * NOTE: the body is empty in this file, so a call currently has no effect;
 * the points above describe the intended call site only.
 */

void vbv_end_of_picture()
{
	/* empty: no VBV bookkeeping is performed */
}

/* calc_vbv_delay
 *
 * has to be called directly after writing the picture start code, the
 * reference point for vbv_delay
 *
 * NOTE: the body is empty in this file, so no vbv_delay is computed; the
 * sentence above describes the intended call site only.
 */

void calc_vbv_delay()
{
	/* empty: no vbv_delay is computed */
}

/*
 * stop_ratectl
 *
 * Shuts the rate controller down by destroying the mutex that
 * ratectl_init_seq() initialised.  The ratectl structure itself is not
 * freed here.
 */
void stop_ratectl(ratectl_t *ratectl)
{
	pthread_mutex_t *lock = &ratectl->ratectl_lock;

	pthread_mutex_destroy(lock);
}
