õpik on kadunud

Soo, eelmise teemaga seonduvaltfailid arendasin hajusaid vektoreid raamatu järgi, mida ma enam ei leia internetist. Kus on õpikud, kui neid vaja on, eh?

Tegemist biti vektoritega, milleks saab muuta kõiksugu vektoreid, kus üks kindel bit tähendab “tühjust”. Näiteks ASCII tühik DBase v2 andmebaasis. Ja idee on pakkimises, mitte random accessis.

Igaljuhul kujunes sellest 2x suurem teek kui see andmebaas ise on ja ei soovitaks ka kellelegi ilma olemasoleva implementatsioonita neid bitte taga ajada.

Aga tulemused on head tundub. Uniform distribution( jah ma ei mäleta, mis see eestikeeles oli) tõenäosusega 0,5 pakkis 2x väiksemaks. Teksti puhul on see suht tihe tekst muide. 0,2 oli umbes 4x jne.

Lisaks sain teada, miks paljud implementatsioonid kasutavad ainult vektoreid, mis on 8-aga jagatavad – sest muidu lihtsalt ei saa midagi aru. Ja tõesti ainult äärmiselt suur fuzz test andis need äärmised juhud. Rohkem testida ei viitsi ka.
Image and video hosting by TinyPic
Jup, armastan ka infograafikat. Selline oli siis tihe tekst.

Tegelikult peab olema igas tehnoloogia postituses graafikuid. Muidu ei saa wiki järgi häkkerid tekstist aru. Jup.

Selle koodi juures oli huvitav, et ma ei saanud mitte midagi koodist aru juba siis, kui ma seda kirjutasin. Tavaliselt võtab kuid aega, enne kui bittwiddling koodist midagi aru ei saa. Õnneks olid mingid testid, mille järgi sai kontrollida koodi õigsust. Ma ei ole tegelikult üldse mures, et midagi aru ei saa, või et kood võiks elegantsem olla. Rohkem mures, et ei leia originaal algoritmi kirjeldust üles, mis oli tegelikult lihtne algoritm. Kuid kuna ma nibble-dega selliselt pole kokku puutunud, siis panin muutujaid lihtsalt juurde koguaeg, või võtsin vähemaks. C kood on ka võõras. Seotud link bitarray.

Postitan koodi ka, et postitus pikem oleks, eh. Ma teen koodis päris palju calloc-e ja malloc-e, huvitav, et mingit kiiruse probleemi pole küll märganud. sparse_string võib tulla suurem kui originaal ka muide, see pole tema mure.

//sparse string.h

typedef struct sparse_string
{
	char * bitstream_deleted1;
	char * bitstream_deleted2;
	char * bitstream_deleted3;
	int bitstream_deleted1_size;//sizes in bytes
	int bitstream_deleted2_size;
	int bitstream_deleted3_size;
	int bitstream2_size;
	int bitstream_size;
	char * str;//the trimmed string
	int str_size;//the original string size
	int c_str_size; //trimmed string size

}sparse_string;

#define substring_size 4

int get_size(sparse_string str);

sparse_string encode_str(char * str, int str_size, char frequent_char);

char * decode_str(sparse_string str, char frequent_char);

sparse_string encode(char * bitstream, char * bitstream2, int bitstream_size, int bitstream2_size);

char * decode(sparse_string encoded);

char * str_to_bitstream(char * str, int str_len, int * out_bitstream_len, char ** out_bitstream2, int * out_bitstream2_len, char frequent_char);

int fuzzy_test();
int fuzzy_test2();






//----------------------------
//sparse string.c
#include "stdafx.h"



int fuzzy_test2()
{
	sparse_string sparse;
	float max_compression;
	int r;
	int max_len;
	int min_len;
	int max_char;
	int is_neg;
	int sparse_size;
	int probability;
	char * str;
	char * str_ret;
	char ch;
	char ch2;
	int nr_runs;
	int i, j;
	max_compression = 0;
	max_char = 126 - 41;
	max_len = 1000;
	min_len = 1000000;
	nr_runs = 1;
	probability = 2;
	srand( (unsigned)time( NULL ) );
	r = 1 + rand() % max_len;
	
	while( -- nr_runs >= 0)
	{
		r = 100000000;//(min_len + rand() % max_len) * 8;
		str = calloc(r, sizeof(char));
		for(i = 0; i < r ; i++)
		{
			is_neg = rand() % 2; 
			if( (rand() % probability) == 0)
			{
				str[i] = 41 + rand() % max_char;
			}
			else
				str[i] = ' ';
		}
		sparse = encode_str(str, r, ' ');
		sparse_size = get_size(sparse);
		if(max_compression < ((float)r / (float)sparse_size))
			max_compression = ((float)r / (float)sparse_size);
		str_ret = decode_str(sparse, ' ');
		for(i = 0; i < r; i++)
		{
			ch = str_ret[i];
			ch2 = str[i];
			if(str_ret[i] != str[i])
			{
				printf("failed to return same string");
			}
			
		}
		if(sparse_size > r)
		{
				printf("failed to compress and resulted in expansion with string of size = ");
				printf("%d", sparse_size);
				printf("\n");
		}
		free(str);
		free(str_ret);
		free(sparse.bitstream_deleted1);
		free(sparse.bitstream_deleted2);
		free(sparse.bitstream_deleted3);
		free(sparse.str);
	}
	printf("Max compression with these settings is: ");
	printf("%f", max_compression);
	return 0;
}

int get_size(sparse_string sparse)
{
	return (sparse.bitstream_deleted1_size + sparse.bitstream_deleted2_size  + sparse.bitstream_deleted3_size + sparse.c_str_size + 44);//+44 is size of the sparse_string structure as it uses ints
}
char * str_to_bitstream(char * str, int str_len, int * out_bitstream_len, char ** out_bitstream2, int * out_bitstream2_len, char frequent_char)
{
	char c[substring_size];
	char * original_bitstream;
	char * bitstream;
	int str_size;
	int original_bitstream_size;
	int compressed_part_size;
	int bitstream_size = 0;
	int i = 0;
	int j = 0;
	int k = 0;
	int l = 0;
	int m = 0;
	int n = 0;
	str_size = str_len;
	compressed_part_size = str_size - (str_size % substring_size);
	original_bitstream_size = str_size / 8;
	bitstream_size = str_size / substring_size;
	if(bitstream_size % 8 != 0)
		bitstream_size = (bitstream_size / 8)+1;
	else
		bitstream_size = bitstream_size / 8;
	original_bitstream = (char*) calloc(original_bitstream_size, sizeof(char));
	*out_bitstream2  = calloc(bitstream_size, sizeof(char));
	
	for(i = 0; i < compressed_part_size; i++)
	{
		if(str[i] == frequent_char)
		{
			c[k] = 0;
			original_bitstream[m] &= ~(1 << n++);
		}
		else
		{
			c[k] = 1;
			original_bitstream[m] |= (1 << n++);
		}
		
		if(n == 8)
		{
			n = 0;
			m++;
		}
		c[0] |= c[k++];
		 
		if(k == substring_size)
		{
			k = 0;
			if(c[0] == 0)
			{
				*(*out_bitstream2+j) &= ~(1 << l++);
			}
			else
				*(*out_bitstream2+j) |= 1 << l++;
			memset(c, 0, sizeof(char) * substring_size);
			if(l == 8)
				l = 0;
		}
		if( (i % (8 * substring_size) ) == 0 && i !=0)
			j++;
	}
	* out_bitstream_len = original_bitstream_size;
	* out_bitstream2_len = bitstream_size;
	//out_bitstream2 = bitstream;
	return original_bitstream;
}

char * decode_str(sparse_string str, char frequent_char)
{
	//this is the original bitvector with ones where letters should be
	//needs to be freed later
	char * decoded_str;
	char * dest2;
	char d;
	char check;
	int i = 0;
	int j = 0;
	char* dest = decode(str);
	dest2 = &(*dest);
	decoded_str = calloc(str.str_size, sizeof(char));
	for(i = 0; i < str.str_size; i++)
	{
		if( (i % 8) == 0 && i != 0)
			dest++;
		d = *dest;
		if( d & (1 << (i % 8) ))
		{
			decoded_str[i] = str.str[j++];
		}
		else
		{
			decoded_str[i] = frequent_char;
		}
		//printf("%c", decoded_str[i]);
	}
	free(dest2);
	return decoded_str;
}

sparse_string encode(char * in_bitstream, char * bitstream2, int in_bitstream_size, int in_bitstream2_size)
{
	char c[substring_size];
	sparse_string encoded;
	char * original_bitstream = in_bitstream;
	char * bitstream = bitstream2;
	char * bitstream_deleted;
	char * original_bitstream_deleted;
	char * bitstream_level2; // L3
	int bitstream_level2_size;
	int i = 0;
	int j = 0;
	int k = 0;
	int l = 0;
	int m = 0;
	int n = 0;
	int mask1 = 0xf0;
	int mask2 = 0x0f;
	int bitstream_deleted_sizeb = 0;
	int original_bitstream_deleted_sizeb = 0;
	int original_bitstream_size = in_bitstream_size;
	int bitstream_size = in_bitstream2_size;
	
	bitstream_level2_size = bitstream_size / substring_size;
	if( (bitstream_size % substring_size) != 0)
		bitstream_level2_size = (bitstream_size / substring_size) +1;

	memset(c, 0, sizeof(char) * substring_size);
	bitstream_level2 = (char*)calloc(bitstream_level2_size, sizeof(char));
	//assumes 4 substringsize
	while(i < bitstream_size)
	{
		do
		{
			int an = (bitstream[i] & (1 << j)) >=1 ? 1: 0;
			c[0] |= an << j ;
			j++;
		}
		while( (j) % 4 != 0);
		if(j == 8)
			j = 0;
		if(c[0] == 0)
		{
			bitstream_level2[l] &= ~(1 << k++); //11011
		}
		else
		{
			bitstream_level2[l] |= ( 1 << k++);
		}
		c[0] = 0;
		if(k == 8)
		{
			k =0;
			l++;
		}
		if( (k  % 2) == 0)
			i++;
	}

	//delete zeros
	
	k = 0;
	j = 0;
	l = 0;
	bitstream_deleted = (char*)calloc(bitstream_size, sizeof(char));
	c[0] = 0;
	for(i = 0; i < bitstream_size * 2; i++)
	{
		if( ( (bitstream[i/2] & mask2) && (i % 2 == 0 ) )|| ( (bitstream[i/2] & mask1) && (i % 2 != 0) ) )
		{
			if(i % 2 == 0)
			{
				//start of byte in bitstream
				if(j == 0)
				{

					//start of byte in deleted_stream
					bitstream_deleted[l] |= bitstream[i/2] & mask2;
					j++;
				}
				else
				{
					//middle of byte in deleted stream
					bitstream_deleted[l] |= ( (bitstream[i/2] & mask2) << 4);
					j++;
				}
				
			}
			else
			{
				//middle of byte in bitstream
				if(j == 0)
				{
					//start of byte in deleted stream
					bitstream_deleted[l] |=  ( (bitstream[i/2] & (mask2 << 4)) >> 4 );
					j++;
				}
				else
				{
					//middle of byte in deleted stream
					bitstream_deleted[l] |= bitstream[i/2] & (mask2 << 4);
					j++;
				}
			}
		}
		if( j == 2)
		{
			bitstream_deleted_sizeb++;
			l++;
			j =0;
		}
	}
	bitstream_deleted_sizeb += 1;//plus one for l = 0 case
	bitstream_deleted = (char*)realloc(bitstream_deleted, sizeof(char) * bitstream_deleted_sizeb);
	//
	
	
	k = 0;
	j = 0;
	l = 0;
	original_bitstream_deleted = (char*)calloc(original_bitstream_size, sizeof(char));
	c[0] = 0;
	for(i = 0; i < original_bitstream_size * 2; i++)
	{
		if( ( (original_bitstream[i/2] & mask2) && (i % 2 == 0 ) )|| ( (original_bitstream[i/2] & mask1) && (i % 2 != 0) ) )
		{
			if(i % 2 == 0)
			{
				//start of byte in bitstream
				if(j == 0)
				{

					//start of byte in deleted_stream
					original_bitstream_deleted[l] |= original_bitstream[i/2] & mask2;
					j++;
				}
				else
				{
					//middle of byte in deleted stream
					original_bitstream_deleted[l] |= ( (original_bitstream[i/2] & mask2) << 4);
					j++;
				}
				
			}
			else
			{
				//middle of byte in bitstream
				if(j == 0)
				{
					//start of byte in deleted stream
					original_bitstream_deleted[l] |=  ( (original_bitstream[i/2] & (mask2 << 4)) >> 4 );
					j++;
				}
				else
				{
					//middle of byte in deleted stream
					original_bitstream_deleted[l] |= original_bitstream[i/2] & (mask2 << 4);
					j++;
				}
			}
		}
		if( j == 2)
		{
			original_bitstream_deleted_sizeb++;
			l++;
			j =0;
		}
	}
	original_bitstream_deleted_sizeb += 1;//plus one for l = 0 case
	original_bitstream_deleted = (char*)realloc(original_bitstream_deleted, sizeof(char) * original_bitstream_deleted_sizeb);

	encoded.bitstream_deleted1 = original_bitstream_deleted;
	encoded.bitstream_deleted1_size = original_bitstream_deleted_sizeb;
	encoded.bitstream_deleted2 = bitstream_deleted;
	encoded.bitstream_deleted2_size = bitstream_deleted_sizeb;
	encoded.bitstream_deleted3  = bitstream_level2;
	encoded.bitstream_deleted3_size = bitstream_level2_size;
	encoded.bitstream2_size = bitstream_size;
	encoded.bitstream_size = original_bitstream_size;
	return encoded;
}

//you need to free the returned string if you use this function
char * decode(sparse_string encoded)
{
	char c[substring_size];

	int lo = 0x0f;
	int hi = 0xf0;
	char * original_bitstream_deleted;
	char * bitstream_deleted;
	char * bitstream_level2;
	char * decoded;
	char * original_decoded;
	int original_bitstream_deleted_sizeb;
	int bitstream_deleted_sizeb;
	int bitstream_level2_size;
	int bitstream_size;
	int original_bitstream_size;
	int i = 0;
	int j = 0;
	int k = 0;
	int l = 0;
	int m = 0;
	int n = 0;
	bitstream_size = encoded.bitstream2_size;
	original_bitstream_size = encoded.bitstream_size;
	original_bitstream_deleted = encoded.bitstream_deleted1;
	bitstream_deleted = encoded.bitstream_deleted2;
	bitstream_level2 = encoded.bitstream_deleted3;
	original_bitstream_deleted_sizeb = encoded.bitstream_deleted1_size;
	bitstream_deleted_sizeb = encoded.bitstream_deleted2_size;
	bitstream_level2_size = encoded.bitstream_deleted3_size;
	

	decoded = (char*) calloc(bitstream_size, sizeof(char));
	
	j = 0;
	i = 0;
	k = 0;
	l = 0;
	n = 0;
    
	for(i = 0;  i < bitstream_level2_size * 8; i++)
	{
			if(bitstream_level2[i/8] & (1 << j++))
			{
				if(l % 2 == 0)
				{
					//deleted is start
					if(k % 2 == 0)
					{
						//k is start
						decoded[k/2] |= bitstream_deleted[l/2] & lo;
					}
					else
					{
						// k is middle
						decoded[k/2] |= ( (bitstream_deleted[l/2] & lo) << 4);
					}
				}
				else
				{
					//deleted is middle
					if(k % 2 == 0)
					{
						//k is start
						decoded[k/2] |= ( (bitstream_deleted[l/2] & hi) >> 4 );
					}
					else
					{
						// k is middle
						decoded[k/2] |= bitstream_deleted[l/2] & hi;
					}
				}
			}
			else
			{
				l--;
			}
		if(j == 8)
			j=0;
		l++;
		k++;
	}
	
	original_decoded = (char*)calloc(original_bitstream_size, sizeof(char));
	j = 0;
	i = 0;
	k = 0;
	l = 0;
	n = 0;
    
	for(i = 0;  i < bitstream_size * 8; i++)
	{
		if(decoded[i/8] & (1 << j++))
			{
				if(l % 2 == 0)
				{
					//deleted is start
					if(k % 2 == 0)
					{
						//k is start
						original_decoded[k/2] |= original_bitstream_deleted[l/2] & lo;
					}
					else
					{
						// k is middle
						original_decoded[k/2] |= ( (original_bitstream_deleted[l/2] & lo) << 4);
					}
				}
				else
				{
					//deleted is middle
					if(k % 2 == 0)
					{
						//k is start
						original_decoded[k/2] |= ( (original_bitstream_deleted[l/2] & hi) >> 4 );
					}
					else
					{
						// k is middle
						original_decoded[k/2] |= original_bitstream_deleted[l/2] & hi;
					}
				}
			}
			else
			{
				l--;
			}
		if(j == 8)
			j=0;
		l++;
		k++;
	}
	free(decoded);

	return original_decoded;
}

//you need to free all strings in returned sparse_string struct if you use this function, it does not store/modify the original entered string
// I would not dare to use any strings not divisible by 8 as input, so sparse.c_str_size is set to 0 in that case (it should break decoding the string also if unnoticed) 
sparse_string encode_str(char * str, int str_size, char frequent_char)
{
	sparse_string sparse;
	char * compressed_str;
	char * f;
	int bs_sz = 0;
	char * bs2 = NULL;
	int bs2sz = 0;
	int i;
	int j;//compressed_size
	if( (str_size % 8) != 0)
	{
		sparse.c_str_size = 0;
	}
	f = str_to_bitstream(str, str_size, &bs_sz, &bs2, &bs2sz, frequent_char);
	sparse = encode(f, bs2, bs_sz, bs2sz);
	compressed_str = malloc(sizeof(char) * str_size);
	j = 0;
	for(i = 0; i < str_size; i++)
	{
		if(str[i] != frequent_char)
		{
			compressed_str[j++] = str[i]; 
		}
	}
	compressed_str = realloc(compressed_str, j);
	sparse.str = compressed_str;
	sparse.c_str_size = j;
	sparse.str_size = str_size;
	return sparse;
}


	
//----------------------------------------
//sparse_strings.c
//
//
//visual studio "stdafx.h" file
//#include <stdio.h>
//#include <tchar.h>
//#include <stddef.h>
//#include <stdlib.h>
//#include <string.h>
//#include <time.h>
//#include "sparse string.h"
#include "stdafx.h"



int _tmain(int argc, _TCHAR* argv[])
{
	// it's an example, does not compile on purpose.. read it, 
        //sparase_string has one huge test it passes plus some more cornercase tests, so that's encouraging, also I don't know how to code and am blind:p
	//also the trimmed string (it does not store/modify the original string) without the "frequent char" is in the sparse_string struct, everything else there is pretty useless
	char strin[1000];//divisible by 8 input...
	char* file ="c:\\codefile.cs";
	int real_len = 0;
	int i;
	FILE* fp = fopen(file, "r");
	fread(strin, sizeof(char),1000, fp);
	spa = encode_str(strin, 1000, ' ');
	real_len = get_size(spa);
	dest = decode_str(spa, ' ');
	for(i = 0; i < spa.str_size; i++)
	{
		printf("%c", dest[i]);
	}
	//free all strings in spa... do it yourself like written in function definitions (encode_str I think) also free the string returned by decode_str
	//free(...)free(...)...
	return 0;
}


Advertisements