This example script reads in a comma-separated values (CSV) file, such as helmi2000.csv, and writes it to an hdf5 file that can be read by vaex. Since writing the rows individually is quite slow, the rows are written in batches.
Example file: `helmi2000.csv <https://www.astro.rug.nl/~breddels/vaex/helmi2000.csv>`_
# -*- coding: utf-8 -*-
import h5py
import sys
import numpy
def count_data_rows(csv_file):
    """Return the number of non-blank lines that follow the header line.

    The first line is assumed to hold the column names and is never counted.
    The file position is left at the end of the file.
    """
    lines = iter(csv_file)
    next(lines, None)  # skip the header (if the file is empty this is a no-op)
    return sum(1 for line in lines if line.strip())


def write_rows_in_batches(lines, datasets, row_count, batch_size=10000):
    """Parse comma separated float rows and store them column-wise in batches.

    Writing single elements to an hdf5 dataset is slow, so the values are
    collected in one in-memory buffer per column and copied to the datasets
    with one slice assignment per full buffer.

    lines      -- iterable of text lines, one row per line (header excluded)
    datasets   -- one 1-d float64 target per column supporting slice
                  assignment (h5py datasets or numpy arrays)
    row_count  -- total number of rows, only used for the progress message
    batch_size -- number of rows buffered before they are written out

    Returns the number of rows written.  Blank lines are skipped.  Raises
    ValueError when a value is not a float or a row has too few values.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1, got %r" % (batch_size,))
    buffers = [numpy.zeros((batch_size, ), dtype='f8') for _ in datasets]
    batch_start = 0  # dataset index of the first row held in the buffers
    filled = 0       # number of rows currently held in the buffers

    for line in lines:
        if not line.strip():
            continue
        values = [float(value) for value in line.split(",")]
        if len(values) < len(buffers):
            raise ValueError("row %d has %d values, expected %d"
                             % (batch_start + filled, len(values), len(buffers)))
        for buffer, value in zip(buffers, values):
            buffer[filled] = value
        filled += 1
        # flush *after* storing the row that completes the batch, so that no
        # buffered value is overwritten before it has been written out
        if filled == batch_size:
            for dataset, buffer in zip(datasets, buffers):
                dataset[batch_start:batch_start + filled] = buffer
            batch_start += filled
            filled = 0
            print("at %d of %d" % (batch_start, row_count))

    if filled:
        # the last, partially filled batch
        print("writing out last part")
        for dataset, buffer in zip(datasets, buffers):
            dataset[batch_start:batch_start + filled] = buffer[:filled]
        batch_start += filled
    return batch_start


def convert_csv_to_hdf5(csv_path, hdf5_path="example.hdf5", batch_size=10000):
    """Convert the csv file *csv_path* to an hdf5 file readable by vaex.

    The first line of the csv file must contain the column names; all other
    values are assumed to be floats.  Every column is stored as a float64
    dataset in the group "columns".  Raises ValueError for an empty file.
    """
    with open(csv_path) as csv_file:
        # first pass: count the rows so the datasets can be created at full size
        line_count = count_data_rows(csv_file)
        print("file contains %d rows" % line_count)

        csv_file.seek(0)  # start from the beginning of the file again
        lines = iter(csv_file)
        header = next(lines, None)
        if header is None:
            raise ValueError("%s is empty, expected a line with column names" % csv_path)
        columns = header.strip().split(",")
        print("columns %s" % (columns,))

        h5file = h5py.File(hdf5_path, "w")
        try:
            # vaex reads all datasets in the columns group
            h5columns = h5file.create_group("columns")
            h5_datasets = [h5columns.create_dataset(column_name, (line_count, ), dtype='f8')
                           for column_name in columns]
            write_rows_in_batches(lines, h5_datasets, line_count, batch_size)
        finally:
            h5file.close()  # make sure everything is flushed to disk


if __name__ == "__main__":
    convert_csv_to_hdf5(sys.argv[1])
; Convert an ascii table with the three columns E, L and Lz to an hdf5 file.
; Every row is read from the ascii file and written to element [index] of the
; three datasets in the group "columns".
PRINT, 'convert ascii file to hdf5'
testfile = '/Users/users/breddels/gavi/src/SubspaceFinding/data/helmi2000.asc'
h5file_id = H5F_CREATE('/tmp/test.hdf5')
N = 3300000; nr of rows
h5group_columns = H5G_CREATE(h5file_id, "columns") ; for vaex, all columns should be grouped under columns
h5type_id = H5T_IDL_CREATE(1.0d) ; create double datatype
h5data_id = H5S_CREATE_SIMPLE(N)
h5_E = H5D_CREATE(h5group_columns, 'E', h5type_id, h5data_id)
h5_L = H5D_CREATE(h5group_columns, 'L', h5type_id, h5data_id)
h5_Lz = H5D_CREATE(h5group_columns, 'Lz', h5type_id, h5data_id)
; file dataspace, shared by the three datasets since they have the same shape
dataspace = H5D_GET_SPACE(h5_E)
; one-element memory dataspace: created once and reused for every write,
; instead of allocating (and leaking) a new handle for every row
memory_space_id = H5S_CREATE_SIMPLE([1])
; define the variables as doubles, otherwise READF reads them in single precision
E = 0d
L = 0d
Lz = 0d
FREE_LUN, 1
OPENR, 1, testfile
index = 0L
WHILE NOT EOF(1) DO BEGIN
  READF, 1, E,L,Lz
  if (index MOD 100000) EQ 0 then begin
    print, index, ' of',N
  end
  ; select the single element [index] in the file dataspace
  H5S_SELECT_HYPERSLAB, dataspace, [index], [1], stride=[1], /RESET
  H5D_WRITE, h5_E, [E], MEMORY_SPACE_ID=memory_space_id, FILE_SPACE_ID=dataspace
  H5D_WRITE, h5_L, [L], MEMORY_SPACE_ID=memory_space_id, FILE_SPACE_ID=dataspace
  H5D_WRITE, h5_Lz, [Lz], MEMORY_SPACE_ID=memory_space_id, FILE_SPACE_ID=dataspace
  index = index + 1
ENDWHILE
; release all hdf5 handles before closing the file
H5S_CLOSE, memory_space_id
H5S_CLOSE, dataspace
H5D_CLOSE, h5_E
H5D_CLOSE, h5_L
H5D_CLOSE, h5_Lz
H5S_CLOSE, h5data_id
H5T_CLOSE, h5type_id
H5G_CLOSE, h5group_columns
H5F_CLOSE, h5file_id
FREE_LUN, 1
end
/*
compile as: gcc -Wall -std=c99 -o ascii_to_hdf5 ascii_to_hdf5.c -lhdf5
run as: ./ascii_to_hdf5 example.hdf5 ../../data/helmi2000-header.asc 3300000 3
arguments are: output filename, input filename, rows, columns
*/
#include "hdf5.h"
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <stdlib.h>
#include <stdarg.h>
#include <errno.h>
#include <string.h>
/* Upper bound on the number of columns; sizes column_names below and the offsets array in main. */
#define MAX_COLUMNS 512
/* Column names read from the first line of the input file; each name has a 512 byte buffer. */
char column_names[MAX_COLUMNS][512];
/*
 * Abort the program when a failure condition holds.
 *
 * test:    non-zero when the failure occurred
 * message: printf-style format string, followed by its arguments
 *
 * When test is non-zero the formatted message and a newline are written to
 * stderr and the process exits with EXIT_FAILURE; otherwise nothing happens.
 */
static void
check (int test, const char * message, ...)
{
    va_list arguments;

    /* nothing to report */
    if (!test)
        return;

    va_start (arguments, message);
    vfprintf (stderr, message, arguments);
    va_end (arguments);
    fputs ("\n", stderr);
    exit (EXIT_FAILURE);
}
int main(int argc, char *argv[])
{
hid_t file; /* Handles */
herr_t status;
haddr_t offsets[MAX_COLUMNS];
hsize_t dims[1];
char* filename_output = argv[1];
char* filename_input = argv[2];
FILE* file_input = fopen(filename_input, "r");
int no_rows = atoi(argv[3]);
int no_columns = atoi(argv[4]);
dims[0] = no_rows;
// create the file and the group 'columns', which vaex will expect
file = H5Fcreate(filename_output, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
hid_t group = H5Gcreate1(file, "columns", 0);
// find the column names in the first line
for(int i=0; i<no_columns; i++) {
fscanf(file_input," %s", column_names[i]);
printf("column[%d]: %s\n", i, column_names[i]);
}
fscanf(file_input," \n");
// just create the dataspace using the HDF5 library, and ask for the offset from the beginning of the file
for(int i = 0; i < no_columns; i++) {
hid_t space = H5Screate_simple(1, dims, NULL);
hid_t dcpl = H5Pcreate (H5P_DATASET_CREATE);
H5Pset_layout (dcpl, H5D_CONTIGUOUS); // compact allows us the memory map the file
H5Pset_alloc_time(dcpl, H5D_ALLOC_TIME_EARLY); // need this to allocate the space so offset exists
hid_t dset = H5Dcreate(group, column_names[i], H5T_IEEE_F64LE, space, H5P_DEFAULT, dcpl, H5P_DEFAULT);
offsets[i] = H5Dget_offset(dset);
H5D_space_status_t space_status;
H5Dget_space_status(dset, &space_status);
printf("offset[%d] = %x allocated: %s\n", i, (unsigned int)offsets[i], (space_status == H5D_SPACE_STATUS_ALLOCATED ? "yes" : "no"));
status = H5Dclose (dset);
status = H5Pclose (dcpl);
status = H5Sclose (space);
}
//close the group and file
H5Gclose(group);
status = H5Fclose (file);
// now we can simpy memory map the file (meaning we tread the file as one big 'array'
// the offsets will tell us where we can write the columns
struct stat s;
status = stat(filename_output, &s);
check (status < 0, "stat %s failed: %s", filename_output, strerror (errno));
printf("file size: %lld\n", (unsigned long long)s.st_size);
int fd = open(filename_output, O_RDWR);
check (fd < 0, "open %s failed: %s", filename_output, strerror (errno));
// the mapped pointer points to the beginning of the file
char* mapped = mmap (0, s.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
check (mapped == MAP_FAILED, "mmap %s failed: %s",
filename_output, strerror (errno));
// read in the rows, and directly write them to the file
for(int j=0; j<no_rows; j++) {
for(int i=0; i<no_columns; i++) {
double* column_ptr = (double*)(mapped+offsets[i]);
fscanf(file_input," %lf", &column_ptr[j]);
}
if( ((j % 100000) == 0) & (j > 0) )
printf("%d of %d\n", j, no_rows);
}
printf("done!\n");
close(fd);
}