Skip to content

Commit

Permalink
Added -C option: always create a 'Complete' index file,
Browse files Browse the repository at this point in the history
ignoring possible decompression errors: this is useful to have the
total size of the file, even when it hasn't been correctly gzip-closed
(for example with rotated gzipped logs)
  • Loading branch information
circulosmeos committed Sep 11, 2019
1 parent fb86f5d commit b735dfc
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 32 deletions.
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,14 +72,14 @@ Copy gztool.c to the directory where you compiled zlib, and do:
Usage
=====

gztool (v0.7.2)
gztool (v0.8)
GZIP files indexer and data retriever.
Create small indexes for gzipped files and use them
for quick and random positioned data extraction.
No more waiting when the end of a 10 GiB gzip is needed!
//github.com/circulosmeos/gztool (by Roberto S. Galende)

$ gztool [-b #] [-s #] [-v #] [-cdeEfFhilStTW] [-I <INDEX>] <FILE>...
$ gztool [-b #] [-s #] [-v #] [-cCdeEfFhilStTW] [-I <INDEX>] <FILE>...

Note that actions `-bStT` proceed to an index file creation (if
none exists) INTERLEAVED with data extraction. As extraction and
Expand All @@ -91,6 +91,7 @@ Usage
-b #: extract data from indicated uncompressed byte position of
gzip file (creating or reusing an index file) to STDOUT.
Accepts '0', '0x', and suffixes 'kmgtpe' (^10) 'KMGTPE' (^2).
-C: always create a 'Complete' index file, ignoring possible errors
-c: utility: raw-gzip-compress indicated file to STDOUT
-d: utility: raw-gzip-decompress indicated file to STDOUT
-e: if multiple files are indicated, continue on error (if any)
Expand Down Expand Up @@ -262,7 +263,7 @@ Other tools which try to provide random access to gzipped files
Version
=======

This version is **v0.7.2**.
This version is **v0.8**.

Please read the *Disclaimer*. This is still a beta release. If you find any errors, please open an *Issue*.

Expand Down
85 changes: 56 additions & 29 deletions gztool.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
//
// LICENSE:
//
// v0.1 to v0.7* by Roberto S. Galende, 2019
// v0.1 to v0.8* by Roberto S. Galende, 2019
// //github.com/circulosmeos/gztool
// A work by Roberto S. Galende
// distributed under the same License terms covering
Expand Down Expand Up @@ -148,7 +148,7 @@

#define local static

#define GZTOOL_VERSION "0.7.2"
#define GZTOOL_VERSION "0.8"

#define SPAN 10485760L /* desired distance between access points */
#define WINSIZE 32768U /* sliding window size */
Expand Down Expand Up @@ -877,7 +877,9 @@ int check_index_file( struct access *index, unsigned char *file_name, unsigned c
// is not (actually) used later in the app. (Windows could be also emptied, but again,
// index points are not used later.)
// int end_on_first_proper_gzip_eof : end file processing on first proper (at feof()) GZIP EOF
// (to be used when the file contains surely only one gzip stream)
// (to be used when the file contains surely only one gzip stream)
// int always_create_a_complete_index : if 1, write a 'complete' index file even when a decompression error occurs.
// In that case the index built so far is returned through **built, instead of NULL.
// OUTPUT:
// struct returned_output: contains two values:
// .error: Z_* error code or Z_OK if everything was ok
Expand All @@ -886,7 +888,7 @@ local struct returned_output build_index(
FILE *in, unsigned char *file_name, off_t span, struct access **built,
enum INDEX_AND_EXTRACTION_OPTIONS indx_n_extraction_opts, off_t offset,
unsigned char *index_filename, int write_index_to_disk,
int end_on_first_proper_gzip_eof )
int end_on_first_proper_gzip_eof, int always_create_a_complete_index )
{
struct returned_output ret;
off_t totin = 0; /* our own total counters to avoid 4GB limit */
Expand Down Expand Up @@ -1665,9 +1667,17 @@ local struct returned_output build_index(
if ( output_data_counter > 0 )
printToStderr( VERBOSITY_NORMAL, "%ld bytes of data extracted.\n", output_data_counter );
(void)inflateEnd(&strm);
if (index != NULL)
free_index(index);
*built = NULL;
// Error exit path of build_index(): decide what to do with the (possibly partial) index.
// With `-C` (always_create_a_complete_index == 1) the index built so far is kept:
// it is marked with the uncompressed size reached, handed back through *built and
// written to the index file, ignoring the decompression error.
// Otherwise the index is discarded and *built is set to NULL.
if ( always_create_a_complete_index == 1 && index != NULL ) {
    index->file_size = totout; /* size of uncompressed file (useful for bgzip files) */
    // return index pointer and write index to index file, ignoring the decompression error
    *built = index;
    // index_file may be NULL here (the code that follows checks it before fclose()),
    // so only serialize when there is an open output to write to.
    if ( index_file != NULL &&
         ! serialize_index_to_file( index_file, index, index->have ) ) {
        // report the index file *name*: index_file is a FILE * and cannot be printed with %s
        printToStderr( VERBOSITY_NORMAL, "ERROR whilst writing index file '%s'.\n", index_filename );
    }
} else {
    // No index could be preserved (or `-C` was not requested):
    // release whatever was allocated and signal "no index" to the caller.
    if (index != NULL)
        free_index(index);
    *built = NULL;
}
if (index_file != NULL)
fclose(index_file);
return ret;
Expand Down Expand Up @@ -2053,13 +2063,15 @@ local int decompress_file(FILE *source, FILE *dest)
// 0: will just read, but do not write nor update (overwrite) it
// int end_on_first_proper_gzip_eof: end file processing on first proper (at feof()) GZIP EOF
// (to be used when the file contains surely only one gzip stream)
// int always_create_a_complete_index : if 1, write a 'complete' index file even when a decompression error occurs.
// In that case the index built so far is returned through **built, instead of NULL.
// OUTPUT:
// EXIT_* error code or EXIT_OK on success
local int action_create_index(
unsigned char *file_name, struct access **index,
unsigned char *index_filename, enum INDEX_AND_EXTRACTION_OPTIONS indx_n_extraction_opts,
off_t offset, off_t span_between_points, int write_index_to_disk,
int end_on_first_proper_gzip_eof )
int end_on_first_proper_gzip_eof, int always_create_a_complete_index )
{

FILE *in;
Expand Down Expand Up @@ -2149,7 +2161,7 @@ local int action_create_index(

ret = build_index( in, file_name, span_between_points, index,
indx_n_extraction_opts, offset, index_filename, write_index_to_disk,
end_on_first_proper_gzip_eof );
end_on_first_proper_gzip_eof, always_create_a_complete_index );
fclose(in);

if ( ret.error < 0 ) {
Expand Down Expand Up @@ -2467,7 +2479,7 @@ local void print_brief_help() {
fprintf( stderr, " Create small indexes for gzipped files and use them\n" );
fprintf( stderr, " for quick and random positioned data extraction.\n" );
fprintf( stderr, " //github.com/circulosmeos/gztool (by Roberto S. Galende)\n\n" );
fprintf( stderr, " $ gztool [-b #] [-s #] [-v #] [-cdeEfFhilStTW] [-I <INDEX>] <FILE>...\n\n" );
fprintf( stderr, " $ gztool [-b #] [-s #] [-v #] [-cCdeEfFhilStTW] [-I <INDEX>] <FILE>...\n\n" );
fprintf( stderr, " `gztool -hh` for more help\n" );
fprintf( stderr, "\n" );

Expand All @@ -2484,7 +2496,7 @@ local void print_help() {
fprintf( stderr, " for quick and random positioned data extraction.\n" );
fprintf( stderr, " No more waiting when the end of a 10 GiB gzip is needed!\n" );
fprintf( stderr, " //github.com/circulosmeos/gztool (by Roberto S. Galende)\n\n" );
fprintf( stderr, " $ gztool [-b #] [-s #] [-v #] [-cdeEfFhilStTW] [-I <INDEX>] <FILE>...\n\n" );
fprintf( stderr, " $ gztool [-b #] [-s #] [-v #] [-cCdeEfFhilStTW] [-I <INDEX>] <FILE>...\n\n" );
fprintf( stderr, " Note that actions `-bStT` proceed to an index file creation (if\n" );
fprintf( stderr, " none exists) INTERLEAVED with data extraction. As extraction and\n" );
fprintf( stderr, " index creation occur at the same time there's no waste of time.\n" );
Expand All @@ -2494,6 +2506,7 @@ local void print_help() {
fprintf( stderr, " -b #: extract data from indicated uncompressed byte position of\n" );
fprintf( stderr, " gzip file (creating or reusing an index file) to STDOUT.\n" );
fprintf( stderr, " Accepts '0', '0x', and suffixes 'kmgtpe' (^10) 'KMGTPE' (^2).\n" );
fprintf( stderr, " -C: always create a 'Complete' index file, ignoring possible errors\n" );
fprintf( stderr, " -c: utility: raw-gzip-compress indicated file to STDOUT\n" );
fprintf( stderr, " -d: utility: raw-gzip-decompress indicated file to STDOUT\n" );
fprintf( stderr, " -e: if multiple files are indicated, continue on error (if any)\n" );
Expand Down Expand Up @@ -2551,6 +2564,7 @@ int main(int argc, char **argv)
int force_strict_order = 0;
int write_index_to_disk = 1;
int end_on_first_proper_gzip_eof = 0;
int always_create_a_complete_index = 0;
int count_errors = 0;

enum EXIT_APP_VALUES ret_value;
Expand All @@ -2565,7 +2579,7 @@ int main(int argc, char **argv)

action = ACT_NOT_SET;
ret_value = EXIT_OK;
while ((opt = getopt(argc, argv, "b:cdeEfFhiI:ls:StTv:W")) != -1)
while ((opt = getopt(argc, argv, "b:cCdeEfFhiI:ls:StTv:W")) != -1)
switch (opt) {
// help
case 'h':
Expand All @@ -2585,6 +2599,10 @@ int main(int argc, char **argv)
action = ACT_COMPRESS_CHUNK;
actions_set++;
break;
// `-C` always generates a complete index file, ignoring possible decompression errors
case 'C':
always_create_a_complete_index = 1;
break;
// `-d` decompress <FILE> (or stdin if none) to stdout
case 'd':
action = ACT_DECOMPRESS_CHUNK;
Expand Down Expand Up @@ -2665,7 +2683,7 @@ int main(int argc, char **argv)
case '?':
if ( isprint (optopt) ) {
// print warning only if char option is unknown
if ( NULL == strchr("bcdeEfFhiIlSstTvW", optopt) ) {
if ( NULL == strchr("bcCdeEfFhiIlSstTvW", optopt) ) {
printToStderr( VERBOSITY_NORMAL, "Unknown option `-%c'.\n", optopt);
print_help();
}
Expand Down Expand Up @@ -2703,15 +2721,16 @@ int main(int argc, char **argv)
if ( ( action == ACT_COMPRESS_CHUNK || action == ACT_DECOMPRESS_CHUNK ) &&
( force_action == 1 || force_strict_order == 1 || write_index_to_disk == 0 ||
span_between_points != SPAN || index_filename_indicated == 1 ||
end_on_first_proper_gzip_eof == 1 )
end_on_first_proper_gzip_eof == 1 || always_create_a_complete_index == 1 )
) {
printToStderr( VERBOSITY_NORMAL, "WARNING: Ignoring `-EfFIsW` with `-[cd]`\n" );
printToStderr( VERBOSITY_NORMAL, "WARNING: Ignoring `-CEfFIsW` with `-[cd]`\n" );
force_action = 0;
force_strict_order = 0;
write_index_to_disk = 1;
span_between_points = SPAN;
index_filename_indicated = 0;
end_on_first_proper_gzip_eof = 0;
always_create_a_complete_index = 0;
}

if ( span_between_points <= 0 ) {
Expand Down Expand Up @@ -2828,7 +2847,8 @@ int main(int argc, char **argv)
if ( index_filename_indicated == 1 ) {
ret_value = action_create_index( "", &index, index_filename,
EXTRACT_FROM_BYTE, extract_from_byte, span_between_points,
write_index_to_disk, end_on_first_proper_gzip_eof );
write_index_to_disk, end_on_first_proper_gzip_eof,
always_create_a_complete_index );
printToStderr( VERBOSITY_NORMAL, "\n" );
break;
} else {
Expand Down Expand Up @@ -2867,10 +2887,12 @@ int main(int argc, char **argv)
// stdin is a gzip file that must be indexed
if ( index_filename_indicated == 1 ) {
ret_value = action_create_index( "", &index, index_filename, JUST_CREATE_INDEX,
0, span_between_points, write_index_to_disk, end_on_first_proper_gzip_eof );
0, span_between_points, write_index_to_disk, end_on_first_proper_gzip_eof,
always_create_a_complete_index );
} else {
ret_value = action_create_index( "", &index, "", JUST_CREATE_INDEX,
0, span_between_points, write_index_to_disk, end_on_first_proper_gzip_eof );
0, span_between_points, write_index_to_disk, end_on_first_proper_gzip_eof,
always_create_a_complete_index );
}
printToStderr( VERBOSITY_NORMAL, "\n" );
break;
Expand All @@ -2890,10 +2912,12 @@ int main(int argc, char **argv)
// stdin is a gzip file for which an index file must be created on-the-fly
if ( index_filename_indicated == 1 ) {
ret_value = action_create_index( "", &index, index_filename, SUPERVISE_DO, 0,
span_between_points, write_index_to_disk, end_on_first_proper_gzip_eof );
span_between_points, write_index_to_disk, end_on_first_proper_gzip_eof,
always_create_a_complete_index );
} else {
ret_value = action_create_index( "", &index, "", SUPERVISE_DO, 0,
span_between_points, write_index_to_disk, end_on_first_proper_gzip_eof );
span_between_points, write_index_to_disk, end_on_first_proper_gzip_eof,
always_create_a_complete_index );
}
printToStderr( VERBOSITY_NORMAL, "\n" );
break;
Expand All @@ -2903,7 +2927,8 @@ int main(int argc, char **argv)
if ( index_filename_indicated == 1 ) {
ret_value = action_create_index( "", &index, index_filename,
EXTRACT_TAIL, 0, span_between_points,
write_index_to_disk, end_on_first_proper_gzip_eof );
write_index_to_disk, end_on_first_proper_gzip_eof,
always_create_a_complete_index );
} else {
// if an index filename is not indicated, index will not be output
// as stdout is already used for data extraction
Expand All @@ -2916,11 +2941,13 @@ int main(int argc, char **argv)
if ( index_filename_indicated == 1 ) {
ret_value = action_create_index( "", &index, index_filename,
SUPERVISE_DO_AND_EXTRACT_FROM_TAIL, 0, span_between_points,
write_index_to_disk, end_on_first_proper_gzip_eof );
write_index_to_disk, end_on_first_proper_gzip_eof,
always_create_a_complete_index );
} else {
ret_value = action_create_index( "", &index, "",
SUPERVISE_DO_AND_EXTRACT_FROM_TAIL, 0, span_between_points,
write_index_to_disk, end_on_first_proper_gzip_eof );
write_index_to_disk, end_on_first_proper_gzip_eof,
always_create_a_complete_index );
}
printToStderr( VERBOSITY_NORMAL, "\n" );
break;
Expand Down Expand Up @@ -3013,7 +3040,7 @@ int main(int argc, char **argv)
if ( force_strict_order == 1 ) {
ret_value = action_create_index( file_name, &index, index_filename,
JUST_CREATE_INDEX, 0, span_between_points, write_index_to_disk,
end_on_first_proper_gzip_eof );
end_on_first_proper_gzip_eof, always_create_a_complete_index );
}


Expand All @@ -3023,7 +3050,7 @@ int main(int argc, char **argv)
case ACT_EXTRACT_FROM_BYTE:
ret_value = action_create_index( file_name, &index, index_filename,
EXTRACT_FROM_BYTE, extract_from_byte, span_between_points, write_index_to_disk,
end_on_first_proper_gzip_eof );
end_on_first_proper_gzip_eof, always_create_a_complete_index );
break;

case ACT_COMPRESS_CHUNK:
Expand Down Expand Up @@ -3061,7 +3088,7 @@ int main(int argc, char **argv)
// if force_strict_order == 1 action has already been done!
ret_value = action_create_index( file_name, &index, index_filename,
JUST_CREATE_INDEX, 0, span_between_points, write_index_to_disk,
end_on_first_proper_gzip_eof );
end_on_first_proper_gzip_eof, always_create_a_complete_index );
break;

case ACT_LIST_INFO:
Expand All @@ -3077,20 +3104,20 @@ int main(int argc, char **argv)
case ACT_SUPERVISE:
ret_value = action_create_index( file_name, &index, index_filename,
SUPERVISE_DO, 0, span_between_points, write_index_to_disk,
end_on_first_proper_gzip_eof );
end_on_first_proper_gzip_eof, always_create_a_complete_index );
printToStderr( VERBOSITY_NORMAL, "\n" );
break;

case ACT_EXTRACT_TAIL:
ret_value = action_create_index( file_name, &index, index_filename,
EXTRACT_TAIL, 0, span_between_points, write_index_to_disk,
end_on_first_proper_gzip_eof );
end_on_first_proper_gzip_eof, always_create_a_complete_index );
break;

case ACT_EXTRACT_TAIL_AND_CONTINUE:
ret_value = action_create_index( file_name, &index, index_filename,
SUPERVISE_DO_AND_EXTRACT_FROM_TAIL, 0, span_between_points, write_index_to_disk,
end_on_first_proper_gzip_eof );
end_on_first_proper_gzip_eof, always_create_a_complete_index );
printToStderr( VERBOSITY_NORMAL, "\n" );
break;

Expand Down

0 comments on commit b735dfc

Please sign in to comment.