0%

02-xz测试

xz命令参数

-z, --compress      force compression
-d, --decompress force decompression
-t, --test test compressed file integrity
-l, --list list information about .xz files
-k, --keep keep (don't delete) input files
-f, --force force overwrite of output file and (de)compress links
-c, --stdout write to standard output and don't delete input files
-0 ... -9 compression preset; default is 6; take compressor *and*
decompressor memory usage into account before using 7-9!
-e, --extreme try to improve compression ratio by using more CPU time;
does not affect decompressor memory requirements
-T, --threads=NUM use at most NUM threads; the default is 1; set to 0
to use as many threads as there are processor cores
-q, --quiet suppress warnings; specify twice to suppress errors too
-v, --verbose be verbose; specify twice for even more verbose
-h, --help display this short help and exit
-H, --long-help display the long help (lists also the advanced options)
-V, --version display the version number and exit

// With no FILE, or when FILE is -, read standard input.

xz命令算法

xz命令中的数字0-9表示压缩级别,数字越大表示压缩程度越高,但相应的压缩时间也会更长

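在xz命令中,0-9 所有压缩级别使用的都是LZMA2算法(.xz 格式的默认过滤器),级别0同样会进行压缩,并不是“存储”模式。各级别的区别在于预设参数:字典大小随级别增大(级别0为256 KiB,级别6为8 MiB,级别9为64 MiB);级别0-3使用快速模式和哈希链匹配查找器,级别4-9使用普通模式和二叉树匹配查找器,因此下面的测试中从 -3 到 -4 压缩率和耗时会出现明显变化。LZMA和LZMA2都基于LZ77式的字典压缩,并使用区间编码(range coding,而不是Huffman编码)进行熵编码(详见综述);LZMA2是在LZMA之上的容器格式,增加了对不可压缩数据的分块存储以及对多线程压缩的支持。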

xz压缩测试结果

机型:MacBook Pro;处理器:M1 Pro(8 核);内存:16 GB;硬盘:512 GB 固态硬盘

文件:

image-20230505171943368

1000k行6列 小数点后4位浮点数

xz -1 -k -z 0_100_4decimals.csv -v

0_100_4decimals.csv (1/1)
100 % 20.5 MiB / 45.5 MiB = 0.450 15 MiB/s 0:03

xz -1 -T 4 -k -z 0_100_4decimals.csv -v

0_100_4decimals.csv (1/1)
100 % 20.4 MiB / 45.5 MiB = 0.449

xz -3 -k -z 0_100_4decimals.csv -v

0_100_4decimals.csv (1/1)
100 % 19.7 MiB / 45.5 MiB = 0.434 3.5 MiB/s 0:13

xz -3 -T 4 -k -z 0_100_4decimals.csv -v

0_100_4decimals.csv (1/1)
100 % 19.7 MiB / 45.5 MiB = 0.433 8.1 MiB/s 0:05

xz -4 -k -z 0_100_4decimals.csv -v

0_100_4decimals.csv (1/1)
100 % 17.8 MiB / 45.5 MiB = 0.391 1.9 MiB/s 0:23

xz -4 -T 4 -k -z 0_100_4decimals.csv -v

0_100_4decimals.csv (1/1)
100 % 17.9 MiB / 45.5 MiB = 0.392 5.3 MiB/s 0:08

xz -6 -k -z 0_100_4decimals.csv -v

0_100_4decimals.csv (1/1)
100 % 17.7 MiB / 45.5 MiB = 0.388 1.4 MiB/s 0:31

xz -6 -T 4 -k -z 0_100_4decimals.csv -v

0_100_4decimals.csv (1/1)
100 % 17.7 MiB / 45.5 MiB = 0.388 2.3 MiB/s 0:19

xz -9 -k -z 0_100_4decimals.csv -v

0_100_4decimals.csv (1/1)
100 % 17.6 MiB / 45.5 MiB = 0.386 1.0 MiB/s 0:45

xz -9 -T 4 -k -z 0_100_4decimals.csv -v

0_100_4decimals.csv (1/1)
100 % 17.6 MiB / 45.5 MiB = 0.386 1.0 MiB/s 0:44

xz -e 0_100_4decimals.csv -v

0_100_4decimals.csv (1/1)
100 % 17.6 MiB / 45.5 MiB = 0.387 1.4 MiB/s 0:31

1000k行6列 小数点后8位浮点数

xz -1 -k -z 0_100_8decimals.csv -v

0_100_8decimals.csv (1/1)
100 % 32.7 MiB / 68.4 MiB = 0.478 15 MiB/s 0:04

xz -1 -T 4 -k -z 0_100_8decimals.csv -v

0_100_8decimals.csv (1/1)
100 % 32.7 MiB / 68.4 MiB = 0.478 0:01

xz -3 -k -z 0_100_8decimals.csv -v

0_100_8decimals.csv (1/1)
100 % 32.1 MiB / 68.4 MiB = 0.470 4.4 MiB/s 0:15

xz -3 -T 4 -k -z 0_100_8decimals.csv -v

0_100_8decimals.csv (1/1)
100 % 32.1 MiB / 68.4 MiB = 0.469 7.7 MiB/s 0:08

xz -4 -k -z 0_100_8decimals.csv -v

0_100_8decimals.csv (1/1)
100 % 29.0 MiB / 68.4 MiB = 0.423 1.8 MiB/s 0:38

xz -4 -T 4 -k -z 0_100_8decimals.csv -v

0_100_8decimals.csv (1/1)
100 % 29.0 MiB / 68.4 MiB = 0.425 4.3 MiB/s 0:15

xz -6 -k -z 0_100_8decimals.csv -v

0_100_8decimals.csv (1/1)
100 % 28.8 MiB / 68.4 MiB = 0.421 1.3 MiB/s 0:51

xz -6 -T 4 -k -z 0_100_8decimals.csv -v

0_100_8decimals.csv (1/1)
100 % 28.9 MiB / 68.4 MiB = 0.422 2.9 MiB/s 0:23

xz -9 -k -z 0_100_8decimals.csv -v

0_100_8decimals.csv (1/1)
100 % 28.7 MiB / 68.4 MiB = 0.419 869 KiB/s 1:20

xz -9 -T 4 -k -z 0_100_8decimals.csv -v

0_100_8decimals.csv (1/1)
100 % 28.7 MiB / 68.4 MiB = 0.419 880 KiB/s 1:19

xz -e 0_100_8decimals.csv -v

0_100_8decimals.csv (1/1)
100 % 28.8 MiB / 68.4 MiB = 0.421 1.3 MiB/s 0:53

1000k行6列 小数点后16位浮点数

xz -1 -k -z 0_100_16decimals.csv -v

0_100_16decimals.csv (1/1)
100 % 50.7 MiB / 105.0 MiB = 0.483 15 MiB/s 0:07

xz -1 -T 4 -k -z 0_100_16decimals.csv -v

0_100_16decimals.csv (1/1)
100 % 50.7 MiB / 105.0 MiB = 0.483 0:01

xz -3 -k -z 0_100_16decimals.csv -v

0_100_16decimals.csv (1/1)
100 % 50.1 MiB / 105.0 MiB = 0.477 4.6 MiB/s 0:22

xz -3 -T 4 -k -z 0_100_16decimals.csv -v

0_100_16decimals.csv (1/1)
100 % 50.1 MiB / 105.0 MiB = 0.477 9.9 MiB/s 0:10

xz -4 -k -z 0_100_16decimals.csv -v

0_100_16decimals.csv (1/1)
100 % 47.2 MiB / 105.0 MiB = 0.450 1.8 MiB/s 0:58

xz -4 -T 4 -k -z 0_100_16decimals.csv -v

0_100_16decimals.csv (1/1)
100 % 47.3 MiB / 105.0 MiB = 0.451 4.8 MiB/s 0:21

xz -6 -k -z 0_100_16decimals.csv -v

0_100_16decimals.csv (1/1)
100 % 47.2 MiB / 105.0 MiB = 0.449 1.3 MiB/s 1:20

xz -6 -T 4 -k -z 0_100_16decimals.csv -v

0_100_16decimals.csv (1/1)
100 % 47.2 MiB / 105.0 MiB = 0.450 3.6 MiB/s 0:28

xz -9 -k -z 0_100_16decimals.csv -v

0_100_16decimals.csv (1/1)
100 % 46.7 MiB / 105.0 MiB = 0.445 791 KiB/s 2:15

xz -9 -T 4 -k -z 0_100_16decimals.csv -v

0_100_16decimals.csv (1/1)
100 % 46.7 MiB / 105.0 MiB = 0.445 790 KiB/s 2:16

xz -9 -T 4 -e -k -z 0_100_16decimals.csv -v

0_100_16decimals.csv (1/1)
100 % 47.1 MiB / 105.0 MiB = 0.449 1.3 MiB/s 1:21

读取xz文件

动态数组 / 单线程

/*
* @Author : Kong Jiangang
* @Date : 2023-05-06 21:15:21
* @LastEditors: kjg 651373472@qq.com
* @LastEditTime: 2023-05-07 03:08:58
* @Description :
* @FilePath : /read_xz/read_xz.cpp
*/
#include <sstream>
#include <iostream>
#include <fstream>
#include <lzma.h>
using namespace std;

// Numeric table placeholder: one row containing a single 0. It is not referenced
// by the readXz()/main() of this listing.
vector<vector<double>> arr(1, vector<double>(1, 0));
// Parsed CSV cells kept as text, filled by readXz(). It starts with one row that
// already holds a single empty string, so row 0 carries a leading empty cell
// before any parsed field is appended to it.
vector<vector<string>> arr2(1, vector<string>(1, ""));

/**
 * Decompress an .xz file and split the decoded CSV text into the global table
 * `arr2` (one inner vector per line, one string per field).
 *
 * Parsing rules (same as before):
 *   - ',', '\r' and '\0' end the current field;
 *   - '\n' ends the current field and appends a new row to `arr2`;
 *   - alphabetic characters are skipped while looking for the start of a field
 *     (so a purely alphabetic header line yields no cells), but are kept once a
 *     field has started (e.g. the 'e' in "1e-5").
 *
 * Changes compared with the previous version:
 *   - a field that straddles two decoder output chunks is now assembled into a
 *     single cell instead of being stored as two cells;
 *   - the parser never reads past the bytes produced by the decoder;
 *   - isalpha() is only called with a value representable as unsigned char;
 *   - a failed open is reported, the decoder is released on every path, decoding
 *     stops at the end of the xz stream, and a truncated input is reported;
 *   - the 2 x ~1 MB work buffers live on the heap instead of the stack.
 *
 * @param path Path of the .xz file to read.
 */
void readXz(string path = "/Users/jiangangkong/workSpace/pycharmWorkSpace/pythonQuant/test_file/0_100_4decimals.csv.xz")
{
    ifstream input(path, ios::binary);
    if (!input.is_open())
    {
        cerr << "Failed to open file: " << path << endl;
        return;
    }

    lzma_stream stream = LZMA_STREAM_INIT;
    lzma_ret ret = lzma_stream_decoder(&stream, UINT64_MAX, 0);
    if (ret != LZMA_OK)
    {
        cerr << "Failed to initialize the decoder: " << lzma_ret(ret) << endl;
        return;
    }

    const int bufsize = 1024 * 1000;
    vector<char> inbuf(bufsize);
    vector<char> outbuf(bufsize);

    // Parser state that must survive from one decoded chunk to the next.
    string number;         // text of the field currently being assembled
    bool in_field = false; // true while `number` holds an unfinished field

    // Append the finished field as a new column of the current row.
    auto store_field = [&]()
    {
        arr2.back().emplace_back(number);
        number.clear();
        in_field = false;
    };

    // Feed `len` decoded bytes through the field/row state machine.
    auto parse = [&](const char *data, size_t len)
    {
        for (size_t i = 0; i < len; ++i)
        {
            const char ch = data[i];
            const bool terminator = (ch == ',' || ch == '\n' || ch == '\0' || ch == '\r');
            if (in_field)
            {
                if (!terminator)
                {
                    number += ch;
                    continue;
                }
                store_field(); // then fall through to handle the terminator itself
            }
            if (ch == '\n')
            {
                arr2.emplace_back(); // start a new row
                continue;
            }
            if (terminator || isalpha(static_cast<unsigned char>(ch)))
                continue;
            number += ch;
            in_field = true;
        }
    };

    bool stream_ended = false;
    while (!stream_ended)
    {
        input.read(inbuf.data(), bufsize);
        const size_t got = static_cast<size_t>(input.gcount());
        if (got == 0)
            break; // end of file (or read error): nothing more to feed

        stream.next_in = reinterpret_cast<const uint8_t *>(inbuf.data());
        stream.avail_in = got;

        // Keep decoding until this input chunk is consumed and the decoder has
        // no more pending output (a completely filled buffer means "maybe more").
        do
        {
            stream.next_out = reinterpret_cast<uint8_t *>(outbuf.data());
            stream.avail_out = bufsize;

            ret = lzma_code(&stream, LZMA_RUN);
            if (ret != LZMA_OK && ret != LZMA_STREAM_END)
            {
                cerr << "Decoding failed: " << lzma_ret(ret) << endl;
                lzma_end(&stream);
                return;
            }

            parse(outbuf.data(), bufsize - stream.avail_out);
            stream_ended = (ret == LZMA_STREAM_END);
        } while (!stream_ended && (stream.avail_in > 0 || stream.avail_out == 0));
    }

    // The last line may not end with a separator.
    if (in_field)
        store_field();

    if (!stream_ended)
        cerr << "Decoding failed: input ended before the end of the xz stream" << endl;

    lzma_end(&stream);
    return;
}

/**
 * Runs readXz() with its default path, prints the dimensions of the parsed
 * table and then the processor time spent (as reported by clock()).
 */
int main()
{
    const clock_t began = clock();

    readXz();

    cout << endl;
    cout << "arr2.size():" << arr2.size();
    cout << " ";
    cout << "arr2[0].size():" << arr2[0].size() << endl;

    // The timestamp is taken after the size report, so printing it is included.
    const clock_t finished = clock();
    const double seconds = (double)(finished - began) / CLOCKS_PER_SEC;

    cout << endl;
    cout << "The run time is: " << seconds << "s" << endl;

    return 0;
}
The run time is: 2.29194s
The run time is: 2.27807s
The run time is: 2.32385s

一次性分配内存 / 单线程

/*
* @Author : Kong Jiangang
* @Date : 2023-05-06 21:15:21
* @LastEditors: kjg 651373472@qq.com
* @LastEditTime: 2023-05-07 03:08:58
* @Description :
* @FilePath : /read_xz/read_xz_frd.cpp
*/
#include <sstream>
#include <iostream>
#include <fstream>
#include <lzma.h>
using namespace std;
// Expected table dimensions; used to pre-size arr2 below.
#define ROWS 1000000
#define COLS 6

// r: index of the row currently being filled by readXz() (advanced on each '\n').
// c: running count of stored fields; readXz() derives the column index from it
//    by taking it modulo the column count.
int r = 0, c = 0;
// Numeric table placeholder (one row holding a single 0); not referenced by the
// readXz()/main() of this listing.
vector<vector<double>> arr(1, vector<double>(1, 0));
// Earlier variant kept for reference: table grown dynamically while parsing.
// vector<vector<string>> arr2(1, vector<string>(1, ""));
// Parsed CSV cells kept as text, allocated up front as ROWS x COLS empty strings.
vector<vector<string>> arr2(ROWS, vector<string>(COLS, ""));

/**
 * Decompress an .xz file and store the decoded CSV fields in the pre-allocated
 * global table `arr2`, using the globals `r` (current row) and `c` (running
 * field counter, column = c % COLS).
 *
 * Parsing rules (same as before):
 *   - ',', '\r' and '\0' end the current field;
 *   - '\n' ends the current field and advances `r`;
 *   - alphabetic characters are skipped while looking for the start of a field,
 *     but are kept once a field has started.
 *
 * Changes compared with the previous version:
 *   - fopen() and decoder initialisation failures are reported instead of being
 *     used blindly; the file and the decoder are released on every path;
 *   - `arr2` is grown when the input has more lines than ROWS (a header line
 *     plus ROWS data lines previously wrote one row past the end);
 *   - a field that straddles two decoder output chunks is stored as one cell;
 *   - the parser never reads past the bytes produced by the decoder, and
 *     isalpha() is only called with a value representable as unsigned char;
 *   - decoding stops at the end of the xz stream and truncated input is reported;
 *   - the column count comes from COLS and the work buffers live on the heap.
 *
 * @param path Path of the .xz file to read.
 */
void readXz(string path = "/Users/jiangangkong/workSpace/pycharmWorkSpace/pythonQuant/test_file/0_100_4decimals.csv.xz")
{
    FILE *fp = fopen(path.c_str(), "rb");
    if (fp == nullptr)
    {
        cerr << "Failed to open file: " << path << endl;
        return;
    }

    lzma_stream stream = LZMA_STREAM_INIT;
    lzma_ret ret = lzma_stream_decoder(&stream, UINT64_MAX, 0);
    if (ret != LZMA_OK)
    {
        cerr << "Failed to initialize the decoder: " << lzma_ret(ret) << endl;
        fclose(fp);
        return;
    }

    const int bufsize = 1024 * 1024;
    vector<char> inbuf(bufsize);
    vector<char> outbuf(bufsize);

    // Parser state that must survive from one decoded chunk to the next.
    string number;         // text of the field currently being assembled
    bool in_field = false; // true while `number` holds an unfinished field

    // Write the finished field into arr2[r][c % COLS], growing the table if the
    // input turns out to have more lines than were pre-allocated.
    auto store_field = [&]()
    {
        if (static_cast<size_t>(r) >= arr2.size())
            arr2.resize(static_cast<size_t>(r) + 1, vector<string>(COLS, ""));
        arr2[r][(c++) % COLS] = number;
        number.clear();
        in_field = false;
    };

    // Feed `len` decoded bytes through the field/row state machine.
    auto parse = [&](const char *data, size_t len)
    {
        for (size_t i = 0; i < len; ++i)
        {
            const char ch = data[i];
            const bool terminator = (ch == ',' || ch == '\n' || ch == '\0' || ch == '\r');
            if (in_field)
            {
                if (!terminator)
                {
                    number += ch;
                    continue;
                }
                store_field(); // then fall through to handle the terminator itself
            }
            if (ch == '\n')
            {
                r++; // next line goes to the next row
                continue;
            }
            if (terminator || isalpha(static_cast<unsigned char>(ch)))
                continue;
            number += ch;
            in_field = true;
        }
    };

    bool stream_ended = false;
    while (!stream_ended)
    {
        const size_t got = fread(inbuf.data(), 1, bufsize, fp);
        if (got == 0)
            break; // end of file (or read error): nothing more to feed

        stream.next_in = reinterpret_cast<const uint8_t *>(inbuf.data());
        stream.avail_in = got;

        // Keep decoding until this input chunk is consumed and the decoder has
        // no more pending output (a completely filled buffer means "maybe more").
        do
        {
            stream.next_out = reinterpret_cast<uint8_t *>(outbuf.data());
            stream.avail_out = bufsize;

            ret = lzma_code(&stream, LZMA_RUN);
            if (ret != LZMA_OK && ret != LZMA_STREAM_END)
            {
                cerr << "Decoding failed: " << lzma_ret(ret) << endl;
                lzma_end(&stream);
                fclose(fp);
                return;
            }

            parse(outbuf.data(), bufsize - stream.avail_out);
            stream_ended = (ret == LZMA_STREAM_END);
        } while (!stream_ended && (stream.avail_in > 0 || stream.avail_out == 0));
    }

    // The last line may not end with a separator.
    if (in_field)
        store_field();

    if (!stream_ended)
        cerr << "Decoding failed: input ended before the end of the xz stream" << endl;

    lzma_end(&stream);
    fclose(fp);
    return;
}

/**
 * Runs readXz() with its default path and prints the processor time spent
 * (as reported by clock()).
 */
int main()
{
    const clock_t began = clock();

    readXz();
    cout << endl;

    // Optional size report (disabled):
    // cout << "arr2.size():" << arr2.size() << " "
    // << "arr2[0].size():" << arr2[0].size() << endl;

    const clock_t finished = clock();
    const double seconds = (double)(finished - began) / CLOCKS_PER_SEC;

    cout << endl;
    cout << "The run time is: " << seconds << "s" << endl;

    // Optional dump of the whole table (disabled):
    // for (int i = 0; i < arr2.size(); i++)
    // {
    // for (int j = 0; j < arr2[0].size(); j++)
    // cout << arr2[i][j] << " ";
    // cout << endl;
    // }

    return 0;
}
The run time is: 1.04217s
The run time is: 1.04364s
The run time is: 1.04482s

一次性分配内存 / 单线程 编译器-O2优化

/*
* @Author : Kong Jiangang
* @Date : 2023-05-06 21:15:21
* @LastEditors: kjg 651373472@qq.com
* @LastEditTime: 2023-05-07 03:08:58
* @Description :
* @FilePath : /read_xz/read_xz_frd.cpp
*/
#include <sstream>
#include <iostream>
#include <fstream>
#include <lzma.h>
using namespace std;

// Size in bytes of each I/O buffer used by readXz().
// NOTE(review): the expansion is not parenthesised, so an expression such as
// `x / BUF_SIZE` or `x % BUF_SIZE` would not mean what it appears to mean.
#define BUF_SIZE 1024 * 1024
// Capacity of the flat cell table below, expressed as rows x columns.
#define ROWS 100000000
#define COLS 7

// Running count of stored fields; also the index of the next free slot in arr.
int c = 0;

// Flat table of parsed cells: ROWS * COLS (700,000,000) default-constructed
// strings allocated during static initialisation. No delete[] for it appears in
// this listing.
// NOTE(review): the allocation size is sizeof(std::string) * 700,000,000, which
// is likely well beyond the 16 GB machine described above - TODO confirm the
// intended ROWS value.
string *arr = new string[ROWS * COLS];

/**
 * Tells whether a byte can never start a field.
 *
 * @param c Byte taken from the decoded CSV text.
 * @return true for ',', NUL, '\r', '\n' and for alphabetic characters
 *         (as classified by isalpha in the current C locale); false otherwise.
 */
inline bool is_separator(char c)
{
    // isalpha() has undefined behaviour for negative values other than EOF, and
    // plain char may be signed, so convert to unsigned char first.
    return (c == ',' || c == '\0' || c == '\r' || c == '\n' ||
            std::isalpha(static_cast<unsigned char>(c)) != 0);
}

/**
 * Decompress an .xz file and store the decoded CSV fields, in order of
 * appearance, in the flat global array `arr`; the global `c` counts the fields
 * stored so far.
 *
 * Parsing rules (same as before):
 *   - a field starts at the first byte for which is_separator() is false;
 *   - ',', '\n', '\0' and '\r' end the current field (alphabetic characters
 *     inside a field are kept).
 *
 * Changes compared with the previous version:
 *   - fopen(), lzma_stream_decoder() and lzma_code() results are checked and
 *     failures are reported; the file and decoder are released on every path;
 *   - writes to `arr` are bounded by its ROWS * COLS capacity;
 *   - a field that straddles two decoder output chunks is stored as one cell;
 *   - the parser never reads past the bytes produced by the decoder;
 *   - decoding stops at the end of the xz stream and truncated input is reported;
 *   - the two 1 MiB work buffers live on the heap (held in std::string objects,
 *     the only owning container this listing already uses).
 *
 * @param path Path of the .xz file to read.
 */
void readXz(string path = "/Users/jiangangkong/workSpace/pycharmWorkSpace/pythonQuant/test_file/0_100_4decimals_large.csv.xz")
{
    FILE *fp = fopen(path.c_str(), "rb");
    if (fp == nullptr)
    {
        cerr << "Failed to open file: " << path << endl;
        return;
    }

    lzma_stream stream = LZMA_STREAM_INIT;
    lzma_ret ret = lzma_stream_decoder(&stream, UINT64_MAX, 0);
    if (ret != LZMA_OK)
    {
        cerr << "Failed to initialize the decoder: " << lzma_ret(ret) << endl;
        fclose(fp);
        return;
    }

    const size_t bufsize = BUF_SIZE;
    string inbuf(bufsize, '\0');
    string outbuf(bufsize, '\0');

    // Parser state that must survive from one decoded chunk to the next.
    string number;           // text of the field currently being assembled
    bool in_field = false;   // true while `number` holds an unfinished field
    bool table_full = false; // set once arr has no free slot left

    // Store the finished field in the next free slot of arr, if there is one.
    auto store_field = [&]()
    {
        if (c < ROWS * COLS)
            arr[c++] = number;
        else
            table_full = true;
        number.clear();
        in_field = false;
    };

    // Feed `len` decoded bytes through the field state machine.
    auto parse = [&](const char *data, size_t len)
    {
        for (size_t i = 0; i < len && !table_full; ++i)
        {
            const char ch = data[i];
            if (in_field)
            {
                const bool terminator = (ch == ',' || ch == '\n' || ch == '\0' || ch == '\r');
                if (!terminator)
                {
                    number.push_back(ch);
                    continue;
                }
                store_field();
                continue; // every terminator is also a separator
            }
            if (is_separator(ch))
                continue;
            number.push_back(ch);
            in_field = true;
        }
    };

    bool stream_ended = false;
    while (!stream_ended && !table_full)
    {
        const size_t got = fread(&inbuf[0], 1, bufsize, fp);
        if (got == 0)
            break; // end of file (or read error): nothing more to feed

        stream.next_in = reinterpret_cast<const uint8_t *>(inbuf.data());
        stream.avail_in = got;

        // Keep decoding until this input chunk is consumed and the decoder has
        // no more pending output (a completely filled buffer means "maybe more").
        do
        {
            stream.next_out = reinterpret_cast<uint8_t *>(&outbuf[0]);
            stream.avail_out = bufsize;

            ret = lzma_code(&stream, LZMA_RUN);
            if (ret != LZMA_OK && ret != LZMA_STREAM_END)
            {
                cerr << "Decoding failed: " << lzma_ret(ret) << endl;
                lzma_end(&stream);
                fclose(fp);
                return;
            }

            parse(outbuf.data(), bufsize - stream.avail_out);
            stream_ended = (ret == LZMA_STREAM_END);
        } while (!stream_ended && !table_full &&
                 (stream.avail_in > 0 || stream.avail_out == 0));
    }

    // The last line may not end with a separator.
    if (in_field && !table_full)
        store_field();

    if (table_full)
        cerr << "Cell table is full; remaining fields were not stored" << endl;
    else if (!stream_ended)
        cerr << "Decoding failed: input ended before the end of the xz stream" << endl;

    lzma_end(&stream);
    fclose(fp);
    return;
}

/**
 * Runs readXz() with its default path and prints the processor time it took
 * (as reported by clock()).
 */
int main()
{
    const clock_t began = clock();
    readXz();
    const clock_t finished = clock();

    const double seconds = (double)(finished - began) / CLOCKS_PER_SEC;
    cout << "The run time is: " << seconds << "s" << endl;

    return 0;
}
g++ -Iinclude -I/opt/homebrew/Cellar/xz/5.4.2/include -std=c++17 read_xz_frd.cpp -o read_xz_frd -O2  -llzma
The run time is: 0.813454s
The run time is: 0.811s
The run time is: 0.811843s

一次性分配内存 / 多线程 【未完成】

#include <condition_variable>
#include <iostream>
#include <fstream>
#include <string>
#include <thread>
#include <lzma.h>
#include <queue>
#include <mutex>

// Size in bytes of the shared input and output buffers.
#define BUF_SIZE 1024
// Dimensions of the statically allocated cell table.
#define MAX_ROWS 1000000
#define MAX_COLS 6
using namespace std;

// Condition variable shared by the producer and consumer threads.
condition_variable cv;
mutex mtx; // operations on the lzma_stream must be performed while holding this lock
// Set by the producer once it has finished reading the input file.
bool is_end = false;

// Compressed bytes read from the file (filled by the producer thread).
char inbuf[BUF_SIZE];
// Decoded bytes produced by lzma_code() (filled and parsed by the consumer thread).
char outbuf[BUF_SIZE];
// Parsed CSV cells kept as text.
string arr[MAX_ROWS][MAX_COLS];
// r: row currently being filled; c: running count of stored fields.
int r = 0, c = 0;

/*
lzma_stream fields used below:
next_in / avail_in: pointer to the next input byte and the number of input bytes not yet processed.
next_out / avail_out: pointer to the next output byte and the amount of free space left in the output buffer.
total_in / total_out: number of input and output bytes processed so far.
*/

// Producer thread: reads the compressed file in BUF_SIZE chunks into the shared
// `inbuf` and publishes each chunk through stream.next_in / stream.avail_in
// while holding `mtx`. When reading is over it sets `is_end` and wakes every
// waiter.
//
// Changes compared with the previous version:
//   - the loop runs while the stream is usable instead of "until eof", so a file
//     that failed to open (or a read error) no longer spins forever;
//   - `is_end` is written while holding `mtx`, because the consumer reads it
//     under that mutex.
//
// NOTE(review): the hand-off protocol itself is left unchanged because it cannot
// be corrected in this function alone. As written, the wait predicate is already
// satisfied right after this thread has published a chunk (avail_in > 0), so the
// next iteration may overwrite `inbuf` before it has been decoded; and if
// avail_in drops to 0 this wait relies on a notification that consumer_thread
// does not appear to send. The predicate probably should wait for
// "avail_in == 0" with the consumer notifying after each decode - TODO confirm
// and fix both sides together.
//
// @param input_file Open binary stream positioned at the start of the .xz file.
// @param stream     Decoder state shared with the consumer thread.
void producer_thread(ifstream &input_file, lzma_stream &stream)
{
    while (input_file)
    {
        unique_lock<mutex> lock(mtx);
        cout << "生产者线程已加锁" << endl;
        // Wait for the condition under which a new chunk may be produced.
        cv.wait(lock, [&stream]
                { return stream.avail_in > 0 || stream.avail_in == INT_MAX; });
        cout << "生产者线程开始执行" << endl;

        input_file.read(inbuf, BUF_SIZE);
        stream.avail_in = input_file.gcount();
        stream.next_in = reinterpret_cast<const uint8_t *>(inbuf);

        cv.notify_one();
    }

    {
        // Publish the end-of-input flag under the same mutex the consumer uses.
        lock_guard<mutex> lock(mtx);
        is_end = true;
    }
    cv.notify_all();
}

// Consumer thread: repeatedly decodes into the shared `outbuf` with lzma_code()
// while holding `mtx`, echoes every parsed field to stdout and stores it in the
// global table `arr` (row `r`, column derived from the running counter `c`).
// Returns on a decoding error, or once `is_end` is set and a decode call
// produced no output.
//
// Parsing rules (same as before): ',', '\r' and '\0' end a field; '\n' ends a
// field and advances `r`; alphabetic characters are skipped while looking for
// the start of a field but kept once a field has started.
//
// Changes compared with the previous version (locking, waiting and the decode
// call are untouched):
//   - rows beyond MAX_ROWS are no longer written past the end of `arr`; they are
//     dropped and reported once;
//   - a field that straddles two decoded chunks is assembled into one cell;
//   - the parser never reads past the bytes produced by the decoder, and
//     isalpha() is only called with a value representable as unsigned char;
//   - the column count comes from MAX_COLS instead of a literal 6.
//
// @param stream Decoder state shared with the producer thread.
void consumer_thread(lzma_stream &stream)
{
    // Parser state that must survive from one decoded chunk to the next.
    string number;            // text of the field currently being assembled
    bool in_field = false;    // true while `number` holds an unfinished field
    bool warned_full = false; // the "table full" message is printed only once

    // Echo the finished field and store it if its row fits in the table.
    auto store_field = [&]()
    {
        cout << number << " ";
        const int col = (c++) % MAX_COLS;
        if (r < MAX_ROWS)
        {
            arr[r][col] = number;
        }
        else if (!warned_full)
        {
            cerr << "Row limit reached; extra rows are not stored" << endl;
            warned_full = true;
        }
        number.clear();
        in_field = false;
    };

    while (true)
    {
        unique_lock<mutex> lock(mtx);
        cout << "消费者线程已加锁" << endl;
        // Wait until there is something to process.
        cv.wait(lock, [&stream]
                { return stream.avail_out == 0 || is_end; });
        cout << "消费者线程开始执行" << endl;

        stream.avail_out = BUF_SIZE;
        stream.next_out = reinterpret_cast<uint8_t *>(outbuf);
        lzma_ret ret = lzma_code(&stream, LZMA_RUN);
        if (ret != LZMA_OK && ret != LZMA_STREAM_END)
        {
            cerr << "Decoding failed: " << lzma_ret(ret) << endl;
            return;
        }

        // After decoding, avail_out is the free space left, so the decoded data
        // occupies the first BUF_SIZE - avail_out bytes of outbuf.
        const int bytes_written = BUF_SIZE - stream.avail_out;
        for (int i = 0; i < bytes_written; ++i)
        {
            const char ch = outbuf[i];
            const bool terminator = (ch == ',' || ch == '\n' || ch == '\0' || ch == '\r');
            if (in_field)
            {
                if (!terminator)
                {
                    number += ch;
                    continue;
                }
                store_field(); // then fall through to handle the terminator itself
            }
            if (ch == '\n')
            {
                r++; // next line goes to the next row
                continue;
            }
            if (terminator || isalpha(static_cast<unsigned char>(ch)))
                continue;
            number += ch;
            in_field = true;
        }

        if (is_end && stream.avail_out == BUF_SIZE)
        {
            // The last line may not end with a separator.
            if (in_field)
                store_field();
            return;
        }
    }
}

int main()
{
clock_t start = clock();

ifstream input_file("/Users/jiangangkong/workSpace/pycharmWorkSpace/pythonQuant/test_file/0_100_4decimals.csv.xz", ios::binary);
lzma_stream stream = LZMA_STREAM_INIT;
lzma_ret ret = lzma_stream_decoder(&stream, UINT64_MAX, 0);
stream.avail_in = INT_MAX;

thread producer(producer_thread, ref(input_file), ref(stream));
thread consumer(consumer_thread, ref(stream));

producer.join();
consumer.join();

clock_t end = clock();

cout << arr[1][5];
cout << "The run time is: " << (double)(end - start) / CLOCKS_PER_SEC << "s" << endl;
return 0;
}

*

  • 查看lzma库的版本
lzma --version
  • 查看编译器标志
pkg-config --cflags liblzma
# -I/opt/homebrew/Cellar/xz/5.4.2/include