Browse Source

- added lookahead-functions

master
art1pirat 6 months ago
parent
commit
733d7a4fa1
1 changed files with 229 additions and 167 deletions
  1. +229
    -167
      src/ctx_compress.c

+ 229
- 167
src/ctx_compress.c View File

@@ -21,164 +21,106 @@
/* #include <sys\stat.h> */
#include <sys/stat.h>
#endif
/*
 * Fixed dictionary of 30 short strings (at most five characters each).
 *
 * lookahead_fivechars() scans this table front to back and returns the index
 * of the first entry that is a prefix of the input, so the ORDER of the
 * entries matters: an earlier entry wins over a later one.  main() also
 * writes every entry into the output header.
 *
 * The entries are string literals and must never be written through, hence
 * the const-qualified element type.
 */
const char *fivechar_words[30] = {
  " the ",
  " and ",
  "SDPC",
  " the",
  " or ",
  " an ",
  " to ",
  " of ",
  " is ",
  " it ",
  " in ",
  " on ",
  "ing ",
  " th",
  "nd ",
  " yo",
  ". ",
  "ed ",
  " a ",
  "er ",
  "is ",
  "or ",
  "ng ",
  "re ",
  " pr",
  "our",
  "ou ",
  " co",
  "ion",
  "ve ",
};

/*
 * Count how often the alphabetic character ch repeats in the bytes that
 * follow the current read position of filehandle.
 *
 * filehandle  stream to look ahead in; up to 255 bytes are read from it
 * pos         base offset used to reposition the stream afterwards
 * ch          the character whose repetitions are counted
 *
 * Returns -1 when ch is not alphabetic or the next byte differs from ch,
 * otherwise (number of directly following bytes equal to ch) - 1.  The
 * count is capped so that the result never exceeds 127 and therefore always
 * fits into a char.
 *
 * Only the bytes that fread() actually delivered are inspected, so a run
 * that reaches the end of the file simply ends there.
 *
 * NOTE(review): for alphabetic ch the stream is repositioned to pos + result,
 * i.e. to pos - 1 when nothing repeats.  That arithmetic is kept unchanged
 * here; confirm against the caller which offset pos is meant to be.
 */
char lookahead_alpha (FILE * filehandle, unsigned long pos, char ch) {
  int run = -1;                      /* int counter: no overflow, no sign surprises */
  /* <ctype.h> functions require a value representable as unsigned char */
  if (isalpha((unsigned char) ch)) {
    char buf[255];
    size_t got = fread(buf, sizeof(char), sizeof buf, filehandle);
    for (size_t i = 0; i < got && run < 127; i++) {
      if (buf[i] != ch) {
        break;
      }
      run++;
    }
    fseek(filehandle, (long) pos + run, SEEK_SET); /* resets filehandle */
  }
  return (char) run;
}

/*
 * Placeholder for the bigram lookahead: nothing is read from filehandle and
 * pos is ignored; the result is always -1.
 */
char lookahead_bigrams (FILE * filehandle, unsigned long pos) {
  (void) filehandle;
  (void) pos;
  /* TODO */
  return -1;
}

/*
 * Placeholder for the five-character-word lookahead: nothing is read from
 * filehandle and pos is ignored; the result is always -1.
 */
char lookahead_fivechars (FILE * filehandle, unsigned long pos) {
  (void) filehandle;
  (void) pos;
  /* TODO */
  return -1;
}


int main (const int argc, const char* argv[]) {
const char *infilename = NULL;
if (argc != 2) {
fprintf(stderr, "expecting 1 argument, got %i\n", argc);
exit(EXIT_FAILURE);
}
infilename = argv[1];
fprintf(stderr, "ctx_compress, (c) by Andreas Romeyke, licensed under GPL3.0\n");
fprintf(stderr, "encoding file '%s':\n", infilename);
char truncated_infilename[13];
memset(&truncated_infilename, '\0', 13);
strncpy( truncated_infilename, infilename, 12);
if (strlen( infilename ) > 12) {
fprintf(stderr, "truncating filename '%s' to '%s'\n", infilename, truncated_infilename);
}
FILE * filehandle;
filehandle = fopen( infilename, "rb");
/* write header */
u_int8_t magic_bytes[6] = {
0x03, 0x43, 0x54, 0x30, 0x30, 0x31
};
fwrite(&magic_bytes, sizeof(u_int8_t), 6, stdout);
/* write truncated filename */
fwrite(&truncated_infilename, sizeof(char), strlen(truncated_infilename)+1, stdout);
/* read input */
struct stat st;
stat(infilename, &st);
size_t insize = st.st_size;
/* TODO: calc most frequent 5char words */
char *fivechar_words[30] = {
" the ",
" and ",
"SDPC",
" the",
" or ",
" an ",
" to ",
" of ",
" is ",
" it ",
" in ",
" on ",
"ing ",
" th",
"nd ",
" yo",
". ",
"ed ",
" a ",
"er ",
"is ",
"or ",
"ng ",
"re ",
" pr",
"our",
"ou ",
" co",
"ion",
"ve ",

};
/* TODO: calc most frequent bigrams of rest */
const unsigned char bigrams[254] =
{
0x65, 0x20,
0x20, 0x74,
0x74, 0x68,
0x73, 0x20,
0x20, 0x61,
0x68, 0x65,
0x74, 0x20,
0x69, 0x6e,
0x6e, 0x20,
0x64, 0x20,
0x65, 0x72,
0x72, 0x20,
0x20, 0x6f,
0x61, 0x6e,
0x72, 0x65,
0x6f, 0x75,
0x6f, 0x6e,
0x20, 0x69,
0x20, 0x73,
0x6f, 0x72,
0x79, 0x20,
0x20, 0x77,
0x74, 0x6f,
0x6f, 0x20,
0x2c, 0x20,
0x61, 0x72,
0x61, 0x74,
0x69, 0x73,
0x65, 0x73,
0x2e, 0x20,
0x6e, 0x64,
0x20, 0x63,
0x65, 0x6e,
0x74, 0x65,
0x20, 0x70,
0x20, 0x66,
0x76, 0x65,
0x68, 0x61,
0x6c, 0x65,
0x6e, 0x74,
0x73, 0x74,
0x69, 0x74,
0x72, 0x6f,
0x73, 0x65,
0x20, 0x6d,
0x66, 0x20,
0x20, 0x62,
0x6e, 0x67,
0x20, 0x79,
0x61, 0x6c,
0x65, 0x64,
0x6c, 0x6c,
0x74, 0x69,
0x72, 0x61,
0x79, 0x6f,
0x6f, 0x66,
0x6c, 0x20,
0x6d, 0x65,
0x75, 0x72,
0x65, 0x61,
0x20, 0x68,
0x20, 0x64,
0x61, 0x73,
0x61, 0x20,
0x68, 0x69,
const unsigned char bigrams[254] =
{
0x65, 0x20,
0x20, 0x74,
0x74, 0x68,
0x73, 0x20,
0x20, 0x61,
0x68, 0x65,
0x74, 0x20,
0x69, 0x6e,
0x6e, 0x20,
0x64, 0x20,
0x65, 0x72,
0x72, 0x20,
0x20, 0x6f,
0x61, 0x6e,
0x72, 0x65,
0x6f, 0x75,
0x6f, 0x6e,
0x20, 0x69,
0x20, 0x73,
0x6f, 0x72,
0x79, 0x20,
0x20, 0x77,
0x74, 0x6f,
0x6f, 0x20,
0x2c, 0x20,
0x61, 0x72,
0x61, 0x74,
0x69, 0x73,
0x65, 0x73,
0x2e, 0x20,
0x6e, 0x64,
0x20, 0x63,
0x65, 0x6e,
0x74, 0x65,
0x20, 0x70,
0x20, 0x66,
0x76, 0x65,
0x68, 0x61,
0x6c, 0x65,
0x6e, 0x74,
0x73, 0x74,
0x69, 0x74,
0x72, 0x6f,
0x73, 0x65,
0x20, 0x6d,
0x66, 0x20,
0x20, 0x62,
0x6e, 0x67,
0x20, 0x79,
0x61, 0x6c,
0x65, 0x64,
0x6c, 0x6c,
0x74, 0x69,
0x72, 0x61,
0x79, 0x6f,
0x6f, 0x66,
0x6c, 0x20,
0x6d, 0x65,
0x75, 0x72,
0x65, 0x61,
0x20, 0x68,
0x20, 0x64,
0x61, 0x73,
0x61, 0x20,
0x68, 0x69,
0x63, 0x6f,
0x67, 0x20,
0x6e, 0x65,
@@ -241,7 +183,106 @@ int main (const int argc, const char* argv[]) {
0x68, 0x6f,
0x49, 0x20,
0x72, 0x74
};

/*
 * Run-length lookahead: determine how long the run of the printable
 * character ch is, counting ch itself plus the directly following bytes of
 * filehandle that are equal to it.
 *
 * filehandle  stream positioned just past ch; up to 255 bytes are read
 * pos         offset the run's remainder starts at; the caller in this file
 *             passes the ftell() value taken right after reading ch
 * ch          the character that was just read
 *
 * Returns -1 if ch is not printable (the stream is not touched then).
 * Otherwise returns the run length (>= 1) and leaves the stream positioned
 * on the first byte after the run.
 *
 * The run is capped at 127 characters so that the result always fits into a
 * char on every platform; a longer run is simply reported in several pieces
 * by successive calls.  Only bytes actually delivered by fread() are
 * inspected, so a run that reaches the end of the file ends there instead of
 * being compared against uninitialized buffer contents.
 */
char lookahead_alpha (FILE * filehandle, unsigned long pos, char ch) {
  char len = -1;
  /* <ctype.h> functions require a value representable as unsigned char */
  if (isprint((unsigned char) ch)) {
    char buf[255];
    size_t got = fread(buf, sizeof(char), sizeof buf, filehandle);
    size_t same = 0;                 /* following bytes equal to ch */
    while (same < got && same < 126 && buf[same] == ch) {
      same++;
    }
    len = (char) (same + 1);         /* +1 accounts for ch itself, max. 127 */
    /* also clears the EOF indicator a short read may have set */
    fseek(filehandle, (long) (pos + same), SEEK_SET); /* resets filehandle */
  }
#ifdef DEBUG
  fprintf(stderr, "debug alpha (pos=%li) '%c'x%i\n", ftell(filehandle), ch, len);
#endif
  return len;
}

/* TODO: calc most frequent bigrams of rest */
/*
 * Try to match ch together with the next byte of filehandle against the
 * 127 two-byte entries of the bigrams table.
 *
 * filehandle  stream positioned just past ch; one further byte is read
 * pos         offset to restore when no entry matches; the caller in this
 *             file passes the ftell() value taken right after reading ch
 * ch          the character that was just read
 *
 * Returns the index (0..126) of the first matching table entry and leaves
 * the second byte consumed, or returns -1 and seeks back to pos when there
 * is no match or the input ends right after ch.
 */
char lookahead_bigrams (FILE * filehandle, unsigned long pos, char ch) {
  int found = -1;                    /* int, so the -1 test works even if char is unsigned */
  char buf[2];
  buf[0] = ch;
  buf[1] = '\0';
  /* keep fgetc()'s int result so EOF cannot be confused with a data byte */
  int next = fgetc(filehandle);
  if (next != EOF) {
    buf[1] = (char) next;
    for (int i = 0; i < 127; i++) {
      /* the table holds unsigned bytes; compare as bytes, not as signed chars */
      if (
          ((unsigned char) buf[0] == bigrams[2*i]) &&
          ((unsigned char) buf[1] == bigrams[2*i+1])
         ) {
        found = i;
        break;
      }
    }
  }
  if (found == -1) {
    fseek(filehandle, (long) pos, SEEK_SET); /* resets filehandle */
  }
#ifdef DEBUG
  fprintf(stderr, "debug bigrams (pos=%li) buf='%c%c'\n", ftell(filehandle), buf[0], buf[1]);
#endif
  return (char) found;
}

/* TODO: calc most frequent 5char words */
/*
 * Try to match ch plus up to four following bytes of filehandle against the
 * 30 entries of fivechar_words.
 *
 * filehandle  stream positioned just past ch; up to four bytes are read
 * pos         offset just past ch; the caller in this file passes the
 *             ftell() value taken right after reading ch
 * ch          the character that was just read
 *
 * Returns the index (0..29) of the first entry that is a prefix of the
 * input and leaves the stream positioned right behind the matched word, or
 * returns -1 and seeks back to pos when no entry matches.
 *
 * Entries longer than the data that is really available (short read near
 * the end of the file) are skipped, so no stale buffer bytes take part in
 * the comparison.
 */
char lookahead_fivechars (FILE * filehandle, unsigned long pos, char ch) {
  int found = -1;                    /* int, so the -1 test works even if char is unsigned */
  char buf[5] = {0};
  buf[0] = ch;
  size_t got = fread(buf + 1, sizeof(char), sizeof buf - 1, filehandle);
  size_t avail = got + 1;            /* ch plus the bytes really read */
  for (int i = 0; i < 30; i++) {
    size_t wlen = strlen(fivechar_words[i]);
    if (wlen > avail) {
      continue;                      /* not enough input left for this word */
    }
    if (strncmp(buf, fivechar_words[i], wlen) == 0) {
      found = i;
      /* pos is just past ch, so the word ends wlen-1 bytes further on */
      fseek(filehandle, (long) (pos + wlen - 1), SEEK_SET); /* resets filehandle */
      break;
    }
  }
  if (found == -1) {
    fseek(filehandle, (long) pos, SEEK_SET); /* resets filehandle */
  }
#ifdef DEBUG
  fprintf(stderr, "debug 5chars (pos=%li) buf='%c%c%c%c%c' %s (len=%i)\n", ftell(filehandle), buf[0], buf[1], buf[2], buf[3], buf[4], (found==-1?"false":"true"), found);
#endif
  return (char) found;
}


int main (const int argc, const char* argv[]) {
const char *infilename = NULL;
if (argc != 2) {
fprintf(stderr, "expecting 1 argument, got %i\n", argc);
exit(EXIT_FAILURE);
}
infilename = argv[1];
fprintf(stderr, "ctx_compress, (c) by Andreas Romeyke, licensed under GPL3.0\n");
fprintf(stderr, "encoding file '%s':\n", infilename);
char truncated_infilename[13];
memset(&truncated_infilename, '\0', 13);
strncpy( truncated_infilename, infilename, 12);
#ifdef DEBUG
if (strlen( infilename ) > 12) {
fprintf(stderr, "truncating filename '%s' to '%s'\n", infilename, truncated_infilename);
}
#endif
FILE * filehandle;
filehandle = fopen( infilename, "rb");
/* write header */
u_int8_t magic_bytes[6] = {
0x03, 0x43, 0x54, 0x30, 0x30, 0x31
};
fwrite(&magic_bytes, sizeof(u_int8_t), 6, stdout);
/* write truncated filename */
fwrite(&truncated_infilename, sizeof(char), strlen(truncated_infilename)+1, stdout);
/* read input */
struct stat st;
stat(infilename, &st);
size_t insize = st.st_size;



@@ -249,41 +290,62 @@ int main (const int argc, const char* argv[]) {
for (int i=0; i<30; i++) {
int len = strlen( fivechar_words[i]);
if (len < 5) {len=len+1;}
/* DEBUG: fprintf(stderr, "5w[%i]='%s' len=%i\n", i, fivechar_words[i], len);*/
#ifdef DEBUG
fprintf(stderr, "5w[%i]='%s' len=%i\n", i, fivechar_words[i], len);
#endif
fwrite(fivechar_words[i], sizeof(char), len, stdout);
}
/* write fixed two-character sequence */
fwrite(&bigrams, sizeof(char), 254, stdout);
/* write compressed data */
while (!feof(filehandle)) {
long unsigned i = ftell(filehandle);
/* read char */
char in_c = fgetc(filehandle);
#ifdef DEBUG
fprintf(stderr, "debug fgetc consume (pos=%i)\n", ftell(filehandle));
#endif
int ch = fgetc(filehandle);
long unsigned i = ftell(filehandle);
char in_c;
if (ch == EOF) {
break;
} else {
in_c = ch;
}
#ifdef DEBUG
fprintf(stderr, "debug fgetc consumed '%c' (pos=%i)\n", in_c, ftell(filehandle));
#endif
char out_c;
char lookahead;
if (in_c == '\n') {
out_c=0x13;
#ifdef DEBUG
fprintf(stderr, "debug newline (pos=%i)\n", i);
#endif
out_c=0x0d;
fputc(out_c, stdout);
}
else if (in_c == '\r') { /* do nothing */}
else if (lookahead = lookahead_fivechars(filehandle, i) > -1) {
if (lookahead <10) out_c=lookahead;
else if (lookahead < 12) out_c=lookahead+1;
else out_c=lookahead+2;
else if ((lookahead = lookahead_fivechars(filehandle, i, in_c)) > -1) {
if (lookahead <10) out_c=lookahead; /* 0-9 -> 0-9 */
else if (lookahead < 12) out_c=lookahead+1; /* 10-11 -> 11-12 */
else if (lookahead >= 12)out_c=lookahead+2; /* output 12- -> 14- */
fputc(out_c, stdout);
}
else if (lookahead = lookahead_bigrams(filehandle, i) > -1) {
else if ((lookahead = lookahead_bigrams(filehandle, i, in_c)) > -1) {
out_c=lookahead+128;
fputc(out_c, stdout);
}
else if (lookahead = lookahead_alpha(filehandle, i, in_c) > -1) {
else if ((lookahead = lookahead_alpha(filehandle, i, in_c)) > -1) {
fputc(255, stdout);
fputc(30+lookahead, stdout);
fputc(in_c, stdout);
}
else {
#ifdef DEBUG
fprintf(stderr, "debug else (pos=%i)\n", i);
#endif
fputc(255, stdout);
fputc(in_c, stdout);
}
}
}


Loading…
Cancel
Save