xz命令参数
  -z, --compress      force compression
  -d, --decompress    force decompression
  -t, --test          test compressed file integrity
  -l, --list          list information about .xz files
  -k, --keep          keep (don't delete) input files
  -f, --force         force overwrite of output file and (de)compress links
  -c, --stdout        write to standard output and don't delete input files
  -0 ... -9           compression preset; default is 6; take compressor *and*
                      decompressor memory usage into account before using 7-9!
  -e, --extreme       try to improve compression ratio by using more CPU time;
                      does not affect decompressor memory requirements
  -T, --threads=NUM   use at most NUM threads; the default is 1; set to 0
                      to use as many threads as there are processor cores
  -q, --quiet         suppress warnings; specify twice to suppress errors too
  -v, --verbose       be verbose; specify twice for even more verbose
  -h, --help          display this short help and exit
  -H, --long-help     display the long help (lists also the advanced options)
  -V, --version       display the version number and exit
xz命令算法
xz命令中的数字0-9表示压缩级别,数字越大表示压缩程度越高,但相应的压缩时间也会更长
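在xz命令中,所有压缩级别(0-9)使用的都是LZMA2算法;级别0同样会进行压缩(并不是“存储”模式),只是速度最快、压缩率最低。各级别的区别在于预设参数:字典大小随级别增大(-0 为 256 KiB,-6 为 8 MiB,-9 为 64 MiB);级别0-3使用 fast 模式和哈希链(hc3/hc4)匹配查找,级别4-9使用 normal 模式和二叉树(bt4)匹配查找,因此级别3与级别4之间的压缩率和耗时会出现明显的跳变。LZMA系列算法基于LZ77字典匹配和区间编码(range coding,而不是Huffman编码)(详见综述);LZMA2是在LZMA之上的容器格式,增加了对不可压缩数据块的直接存储,并支持分块和多线程压缩。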
xz压缩测试结果
机型:MacBook Pro
处理器:M1 Pro 8 核
内存:16 GB
硬盘:固态 512 GB
文件:
1000k行6列 小数点后4位浮点数
xz -1 -k -z 0_100_4decimals.csv -v
0_100_4decimals.csv (1/1) 100 % 20.5 MiB / 45.5 MiB = 0.450 15 MiB/s 0:03
xz -1 -T 4 -k -z 0_100_4decimals.csv -v
0_100_4decimals.csv (1/1) 100 % 20.4 MiB / 45.5 MiB = 0.449
xz -3 -k -z 0_100_4decimals.csv -v
0_100_4decimals.csv (1/1) 100 % 19.7 MiB / 45.5 MiB = 0.434 3.5 MiB/s 0:13
xz -3 -T 4 -k -z 0_100_4decimals.csv -v
0_100_4decimals.csv (1/1) 100 % 19.7 MiB / 45.5 MiB = 0.433 8.1 MiB/s 0:05
xz -4 -k -z 0_100_4decimals.csv -v
0_100_4decimals.csv (1/1) 100 % 17.8 MiB / 45.5 MiB = 0.391 1.9 MiB/s 0:23
xz -4 -T 4 -k -z 0_100_4decimals.csv -v
0_100_4decimals.csv (1/1) 100 % 17.9 MiB / 45.5 MiB = 0.392 5.3 MiB/s 0:08
xz -6 -k -z 0_100_4decimals.csv -v
0_100_4decimals.csv (1/1) 100 % 17.7 MiB / 45.5 MiB = 0.388 1.4 MiB/s 0:31
xz -6 -T 4 -k -z 0_100_4decimals.csv -v
0_100_4decimals.csv (1/1) 100 % 17.7 MiB / 45.5 MiB = 0.388 2.3 MiB/s 0:19
xz -9 -k -z 0_100_4decimals.csv -v
0_100_4decimals.csv (1/1) 100 % 17.6 MiB / 45.5 MiB = 0.386 1.0 MiB/s 0:45
xz -9 -T 4 -k -z 0_100_4decimals.csv -v
0_100_4decimals.csv (1/1) 100 % 17.6 MiB / 45.5 MiB = 0.386 1.0 MiB/s 0:44
xz -e 0_100_4decimals.csv -v
0_100_4decimals.csv (1/1) 100 % 17.6 MiB / 45.5 MiB = 0.387 1.4 MiB/s 0:31
1000k行6列 小数点后8位浮点数
xz -1 -k -z 0_100_8decimals.csv -v
0_100_8decimals.csv (1/1) 100 % 32.7 MiB / 68.4 MiB = 0.478 15 MiB/s 0:04
xz -1 -T 4 -k -z 0_100_8decimals.csv -v
0_100_8decimals.csv (1/1) 100 % 32.7 MiB / 68.4 MiB = 0.478 0:01
xz -3 -k -z 0_100_8decimals.csv -v
0_100_8decimals.csv (1/1) 100 % 32.1 MiB / 68.4 MiB = 0.470 4.4 MiB/s 0:15
xz -3 -T 4 -k -z 0_100_8decimals.csv -v
0_100_8decimals.csv (1/1) 100 % 32.1 MiB / 68.4 MiB = 0.469 7.7 MiB/s 0:08
xz -4 -k -z 0_100_8decimals.csv -v
0_100_8decimals.csv (1/1) 100 % 29.0 MiB / 68.4 MiB = 0.423 1.8 MiB/s 0:38
xz -4 -T 4 -k -z 0_100_8decimals.csv -v
0_100_8decimals.csv (1/1) 100 % 29.0 MiB / 68.4 MiB = 0.425 4.3 MiB/s 0:15
xz -6 -k -z 0_100_8decimals.csv -v
0_100_8decimals.csv (1/1) 100 % 28.8 MiB / 68.4 MiB = 0.421 1.3 MiB/s 0:51
xz -6 -T 4 -k -z 0_100_8decimals.csv -v
0_100_8decimals.csv (1/1) 100 % 28.9 MiB / 68.4 MiB = 0.422 2.9 MiB/s 0:23
xz -9 -k -z 0_100_8decimals.csv -v
0_100_8decimals.csv (1/1) 100 % 28.7 MiB / 68.4 MiB = 0.419 869 KiB/s 1:20
xz -9 -T 4 -k -z 0_100_8decimals.csv -v
0_100_8decimals.csv (1/1) 100 % 28.7 MiB / 68.4 MiB = 0.419 880 KiB/s 1:19
xz -e 0_100_8decimals.csv -v
0_100_8decimals.csv (1/1) 100 % 28.8 MiB / 68.4 MiB = 0.421 1.3 MiB/s 0:53
1000k行6列 小数点后16位浮点数
xz -1 -k -z 0_100_16decimals.csv -v
0_100_16decimals.csv (1/1) 100 % 50.7 MiB / 105.0 MiB = 0.483 15 MiB/s 0:07
xz -1 -T 4 -k -z 0_100_16decimals.csv -v
0_100_16decimals.csv (1/1) 100 % 50.7 MiB / 105.0 MiB = 0.483 0:01
xz -3 -k -z 0_100_16decimals.csv -v
0_100_16decimals.csv (1/1) 100 % 50.1 MiB / 105.0 MiB = 0.477 4.6 MiB/s 0:22
xz -3 -T 4 -k -z 0_100_16decimals.csv -v
0_100_16decimals.csv (1/1) 100 % 50.1 MiB / 105.0 MiB = 0.477 9.9 MiB/s 0:10
xz -4 -k -z 0_100_16decimals.csv -v
0_100_16decimals.csv (1/1) 100 % 47.2 MiB / 105.0 MiB = 0.450 1.8 MiB/s 0:58
xz -4 -T 4 -k -z 0_100_16decimals.csv -v
0_100_16decimals.csv (1/1) 100 % 47.3 MiB / 105.0 MiB = 0.451 4.8 MiB/s 0:21
xz -6 -k -z 0_100_16decimals.csv -v
0_100_16decimals.csv (1/1) 100 % 47.2 MiB / 105.0 MiB = 0.449 1.3 MiB/s 1:20
xz -6 -T 4 -k -z 0_100_16decimals.csv -v
0_100_16decimals.csv (1/1) 100 % 47.2 MiB / 105.0 MiB = 0.450 3.6 MiB/s 0:28
xz -9 -k -z 0_100_16decimals.csv -v
0_100_16decimals.csv (1/1) 100 % 46.7 MiB / 105.0 MiB = 0.445 791 KiB/s 2:15
xz -9 -T 4 -k -z 0_100_16decimals.csv -v
0_100_16decimals.csv (1/1) 100 % 46.7 MiB / 105.0 MiB = 0.445 790 KiB/s 2:16
xz -9 -T 4 -e -k -z 0_100_16decimals.csv -v
0_100_16decimals.csv (1/1) 100 % 47.1 MiB / 105.0 MiB = 0.449 1.3 MiB/s 1:21
读取xz文件 动态数组 / 单线程 #include <sstream> #include <iostream> #include <fstream> #include <lzma.h> using namespace std;vector<vector<double >> arr (1 , vector <double >(1 , 0 )); vector<vector<string>> arr2 (1 , vector <string>(1 , "" )); void readXz (string path = "/Users/jiangangkong/workSpace/pycharmWorkSpace/pythonQuant/test_file/0_100_4decimals.csv.xz" ) { string filename = path; ifstream input (filename, ios::binary) ; lzma_stream stream = LZMA_STREAM_INIT; const int bufsize = 1024 * 1000 ; char inbuf[bufsize]; char outbuf[bufsize]; lzma_ret ret = lzma_stream_decoder (&stream, UINT64_MAX, 0 ); if (ret != LZMA_OK) { cerr << "Failed to initialize the decoder: " << lzma_ret (ret) << endl; return ; } while (!input.eof ()) { input.read (inbuf, bufsize); stream.avail_in = input.gcount (); stream.next_in = reinterpret_cast <const uint8_t *>(inbuf); int j = 0 ; do { stream.avail_out = bufsize; stream.next_out = reinterpret_cast <uint8_t *>(outbuf); ret = lzma_code (&stream, LZMA_RUN); if (ret != LZMA_OK && ret != LZMA_STREAM_END) { cerr << "Decoding failed: " << lzma_ret (ret) << endl; return ; } int bytes_written = bufsize - stream.avail_out; const char *p = outbuf; for (int i = 0 ; i < bytes_written;) { if (*p == ',' || isalpha (*p) || *p == '\0' || *p == '\r' ) { i++; p++; continue ; } if (*p == '\n' ) { arr2.emplace_back (); p++; i++; continue ; } string number = "" ; while (*p != ',' && *p != '\n' && i < bytes_written && *p != '\0' && *p != '\r' ) { number += (*p); i++; p++; } arr2.back ().emplace_back (number); } } while (stream.avail_out == 0 ); } lzma_end (&stream); input.close (); return ; } int main () { clock_t start = clock (); readXz (); cout << endl; cout << "arr2.size():" << arr2.size () << " " << "arr2[0].size():" << arr2[0 ].size () << endl; clock_t end = clock (); cout << endl; cout << "The run time is: " << (double )(end - start) / CLOCKS_PER_SEC << "s" << endl; return 0 ; }
The run time is: 2.29194 s The run time is: 2.27807 s The run time is: 2.32385 s
一次性分配内存 / 单线程 #include <sstream> #include <iostream> #include <fstream> #include <lzma.h> using namespace std;#define ROWS 1000000 #define COLS 6 int r = 0 , c = 0 ;vector<vector<double >> arr (1 , vector <double >(1 , 0 )); vector<vector<string>> arr2 (ROWS, vector <string>(COLS, "" )); void readXz (string path = "/Users/jiangangkong/workSpace/pycharmWorkSpace/pythonQuant/test_file/0_100_4decimals.csv.xz" ) { string filename = path; FILE *fp = fopen (filename.c_str (), "rb" ); lzma_stream stream = LZMA_STREAM_INIT; const int bufsize = 1024 * 1024 ; char inbuf[bufsize]; char outbuf[bufsize]; lzma_ret ret = lzma_stream_decoder (&stream, UINT64_MAX, 0 ); while (!feof (fp)) { size_t size = fread (inbuf, 1 , bufsize, fp); stream.avail_in = size; stream.next_in = reinterpret_cast <const uint8_t *>(inbuf); do { stream.avail_out = bufsize; stream.next_out = reinterpret_cast <uint8_t *>(outbuf); ret = lzma_code (&stream, LZMA_RUN); if (ret != LZMA_OK && ret != LZMA_STREAM_END) { cerr << "Decoding failed: " << lzma_ret (ret) << endl; return ; } int bytes_written = bufsize - stream.avail_out; const char *p = outbuf; for (int i = 0 ; i < bytes_written;) { if (*p == ',' || isalpha (*p) || *p == '\0' || *p == '\r' ) { i++; p++; continue ; } if (*p == '\n' ) { r++; p++; i++; continue ; } string number = "" ; while (*p != ',' && *p != '\n' && i < bytes_written && *p != '\0' && *p != '\r' ) { number += (*p); i++; p++; } arr2[r][(c++) % 6 ] = number; } } while (stream.avail_out == 0 ); } lzma_end (&stream); fclose (fp); return ; } int main () { clock_t start = clock (); readXz (); cout << endl; clock_t end = clock (); cout << endl; cout << "The run time is: " << (double )(end - start) / CLOCKS_PER_SEC << "s" << endl; return 0 ; }
The run time is: 1.04217 s The run time is: 1.04364 s The run time is: 1.04482 s
一次性分配内存 / 单线程 编译器-O2优化 #include <sstream> #include <iostream> #include <fstream> #include <lzma.h> using namespace std;#define BUF_SIZE 1024 * 1024 #define ROWS 100000000 #define COLS 7 int c = 0 ;string *arr = new string[ROWS * COLS]; inline bool is_separator (char c) { return (c == ',' || c == '\0' || c == '\r' || c == '\n' || isalpha (c)); } void readXz (string path = "/Users/jiangangkong/workSpace/pycharmWorkSpace/pythonQuant/test_file/0_100_4decimals_large.csv.xz" ) { string filename = path; FILE *fp = fopen (filename.c_str (), "rb" ); lzma_stream stream = LZMA_STREAM_INIT; char inbuf[BUF_SIZE]; char outbuf[BUF_SIZE]; lzma_ret ret = lzma_stream_decoder (&stream, UINT64_MAX, 0 ); while (!feof (fp)) { size_t size = fread (inbuf, 1 , BUF_SIZE, fp); stream.avail_in = size; stream.next_in = reinterpret_cast <const uint8_t *>(inbuf); do { stream.avail_out = BUF_SIZE; stream.next_out = reinterpret_cast <uint8_t *>(outbuf); ret = lzma_code (&stream, LZMA_RUN); int bytes_written = BUF_SIZE - stream.avail_out; char *p = outbuf; for (int i = 0 ; i < bytes_written;) { if (is_separator (*p)) { ++i,++p; continue ; } string number = "" ; while (*p != ',' && *p != '\n' && i < bytes_written && *p != '\0' && *p != '\r' ) { number.push_back (*p); ++i, ++p; } arr[(c++)] = number; } } while (stream.avail_out == 0 ); } lzma_end (&stream); fclose (fp); return ; } int main () { clock_t start = clock (); readXz (); clock_t end = clock (); cout << "The run time is: " << (double )(end - start) / CLOCKS_PER_SEC << "s" << endl; return 0 ; }
g++ -Iinclude -I/opt/homebrew/Cellar/xz/5.4.2/include -std=c++17 read_xz_frd.cpp -o read_xz_frd -O2 -llzma
The run time is: 0.813454 s The run time is: 0.811 s The run time is: 0.811843 s
一次性分配内存 / 多线程 【未完成】 #include <condition_variable> #include <iostream> #include <fstream> #include <string> #include <thread> #include <lzma.h> #include <queue> #include <mutex> #define BUF_SIZE 1024 #define MAX_ROWS 1000000 #define MAX_COLS 6 using namespace std;condition_variable cv; mutex mtx; bool is_end = false ;char inbuf[BUF_SIZE];char outbuf[BUF_SIZE];string arr[MAX_ROWS][MAX_COLS]; int r = 0 , c = 0 ;void producer_thread (ifstream &input_file, lzma_stream &stream) { while (!input_file.eof ()) { unique_lock<mutex> lock (mtx) ; cout << "生产者线程已加锁" << endl; cv.wait (lock, [&stream] { return stream.avail_in > 0 || stream.avail_in == INT_MAX; }); cout << "生产者线程开始执行" << endl; input_file.read (inbuf, BUF_SIZE); stream.avail_in = input_file.gcount (); stream.next_in = reinterpret_cast <const uint8_t *>(inbuf); cv.notify_one (); } is_end = true ; cv.notify_all (); } void consumer_thread (lzma_stream &stream) { while (true ) { unique_lock<mutex> lock (mtx) ; cout << "消费者线程已加锁" << endl; cv.wait (lock, [&stream] { return stream.avail_out == 0 || is_end; }); cout << "消费者线程开始执行" << endl; stream.avail_out = BUF_SIZE; stream.next_out = reinterpret_cast <uint8_t *>(outbuf); lzma_ret ret = lzma_code (&stream, LZMA_RUN); if (ret != LZMA_OK && ret != LZMA_STREAM_END) { cerr << "Decoding failed: " << lzma_ret (ret) << endl; return ; } int bytes_written = BUF_SIZE - stream.avail_out; const char *p = outbuf; for (int i = 0 ; i < bytes_written;) { if (*p == ',' || isalpha (*p) || *p == '\0' || *p == '\r' ) { i++; p++; continue ; } if (*p == '\n' ) { r++; p++; i++; continue ; } string number = "" ; while (*p != ',' && *p != '\n' && i < bytes_written && *p != '\0' && *p != '\r' ) { number += (*p); i++; p++; } cout << number << " " ; arr[r][(c++) % 6 ] = number; } if (is_end && stream.avail_out == BUF_SIZE) { return ; } } } int main () { clock_t start = clock (); ifstream input_file ("/Users/jiangangkong/workSpace/pycharmWorkSpace/pythonQuant/test_file/0_100_4decimals.csv.xz" , 
ios::binary) ; lzma_stream stream = LZMA_STREAM_INIT; lzma_ret ret = lzma_stream_decoder (&stream, UINT64_MAX, 0 ); stream.avail_in = INT_MAX; thread producer (producer_thread, ref(input_file), ref(stream)) ; thread consumer (consumer_thread, ref(stream)) ; producer.join (); consumer.join (); clock_t end = clock (); cout << arr[1 ][5 ]; cout << "The run time is: " << (double )(end - start) / CLOCKS_PER_SEC << "s" << endl; return 0 ; }
*
pkg-config --cflags liblzma