Commit

draft support for GeoFrame data structure, Triangulation<1,1> is constructed from the number of nodes, not of subintervals, MdArray support for full_extent, buffered_istream, parsing utils (token_stream), csv implementation revisited to use buffered_istream. minor adjustments
AlePalu committed Oct 16, 2024
1 parent d583eb8 commit c440a9d
Showing 12 changed files with 1,044 additions and 197 deletions.
127 changes: 127 additions & 0 deletions fdaPDE/geoframe/batched_istream.h
@@ -0,0 +1,127 @@
// This file is part of fdaPDE, a C++ library for physics-informed
// spatial and functional data analysis.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

#ifndef __BATCHED_ISTREAM_H__
#define __BATCHED_ISTREAM_H__

#include <cstddef>
#include <filesystem>
#include <fstream>
#include <stdexcept>
#include <string>

namespace fdapde {
namespace internals {

// A buffered input stream implementation for block-reading of files
struct batched_istream_impl {
using char_t = char;
using size_t = std::size_t;
using buff_t = char_t*;
static constexpr size_t blk_sz__ = 1024; // block size is expressed as multiples of 1KB
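// e.g., the default blk_factor = 4 used by the constructors below yields blocks of 4 * 1024 = 4096 bytes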

batched_istream_impl() noexcept = default;
batched_istream_impl(const std::string& filename, size_t blk_factor = 4) :
stream_(), size_(0), blk_sz_(blk_sz__ * blk_factor), pos_(0) {
open(filename, blk_sz_);
}
batched_istream_impl(const char* filename, size_t blk_factor = 4) :
stream_(), size_(0), blk_sz_(blk_sz__ * blk_factor), pos_(0) {
open(filename, blk_sz_);
}
batched_istream_impl(const std::filesystem::path& filename, size_t blk_factor = 4) :
stream_(), size_(0), blk_sz_(blk_sz__ * blk_factor), pos_(0) {
open(filename, blk_sz_);
}
// read next block of data
void read() {
// every block is blk_sz_ bytes long, except the last one, which holds the remaining bytes
buff_sz_ = (pos_ == n_blk_ - 1) ? size_ - (n_blk_ - 1) * blk_sz_ : blk_sz_;
// fetch block of data
stream_.read(buff_, buff_sz_);
pos_++;
}
// return number of valid bytes last extracted in buffer
size_t size() const { return buff_sz_; }
// pointer to read data
const char* data() const { return buff_; }
size_t tellg() const { return pos_; }
batched_istream_impl& seekg(size_t pos) {
fdapde_assert(pos < n_blk_);
pos_ = pos;
return *this;
}
bool end() { return stream_.peek() == EOF; }
operator bool() { return !end(); }
// counts how many newline characters '\n' appear in the file (requires reading the entire file)
size_t n_lines() {
size_t n = 0;
while (!end()) {
read();
for (size_t i = 0; i < size(); ++i) {
if (buff_[i] == '\n') { n++; }
}
}
// reset status
stream_.seekg(0, std::ios::beg); // rewind to the beginning
pos_ = 0;
return n;
}
// file operations
void close() {
if (buff_) { // guard against double free and never-opened streams
delete[] buff_; // deallocate memory
buff_ = nullptr;
}
// reset status
if (stream_.is_open()) { stream_.close(); }
size_ = 0, n_blk_ = 0, buff_sz_ = 0, pos_ = 0;
}
bool is_open() const { return stream_.is_open(); }
void open(const char* filename, size_t blk_sz) {
if (!std::filesystem::exists(filename)) {
throw std::runtime_error("file " + std::string(filename) + " not found.");
}
stream_.open(filename, std::ios::binary | std::ios::ate);
size_ = stream_.tellg();
n_blk_ = (size_ + blk_sz - 1) / blk_sz; // number of blocks, rounded up (integer ceiling division)
blk_sz_ = blk_sz;
stream_.seekg(0, std::ios::beg); // rewind to the beginning
buff_ = new char_t[blk_sz];
pos_ = 0;
}
void open(const std::string& filename, size_t blk_sz) { open(filename.c_str(), blk_sz); }
void open(const std::filesystem::path& filename, size_t blk_sz) { open(filename.string(), blk_sz); }
~batched_istream_impl() { close(); }
private:
std::ifstream stream_;
size_t size_ = 0; // size (in bytes) of stream
size_t blk_sz_ = 0; // size (in bytes) of a block: blk_factor * blk_sz__
size_t n_blk_ = 0; // number of blocks in stream
size_t buff_sz_ = 0; // number of valid bytes in buff_
size_t pos_ = 0; // index of the next block to read
buff_t buff_ = nullptr; // read buffer, blk_sz_ bytes long
};

} // namespace internals

template <typename FileName> auto batched_istream(const FileName& filename, std::size_t blk_factor) {
return internals::batched_istream_impl(filename, blk_factor);
}
template <typename FileName> auto batched_istream(const FileName& filename) {
return internals::batched_istream_impl(filename);
}

} // namespace fdapde

#endif // __BATCHED_ISTREAM_H__
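
For orientation, a minimal usage sketch of the new stream (the file name data.csv is a placeholder; read(), data(), size() and n_lines() are the members defined above):

#include <iostream>
#include <string_view>
#include "fdaPDE/geoframe/batched_istream.h"

int main() {
    // open the file with blk_factor = 4, i.e., 4 * 1024 = 4096-byte blocks
    auto stream = fdapde::batched_istream("data.csv", 4);
    std::cout << "lines: " << stream.n_lines() << std::endl;   // scans the whole file, then rewinds
    while (stream) {
        stream.read();                                         // fetch the next block
        std::string_view blk(stream.data(), stream.size());    // valid bytes of the last read block
        // ... process blk ...
    }
    return 0;
}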
147 changes: 147 additions & 0 deletions fdaPDE/geoframe/csv.h
@@ -0,0 +1,147 @@
// This file is part of fdaPDE, a C++ library for physics-informed
// spatial and functional data analysis.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

#ifndef __CSV_H__
#define __CSV_H__

#include <algorithm>
#include <limits>
#include <stdexcept>
#include <string>
#include <string_view>
#include <vector>

#include "batched_istream.h"
#include "parsing.h"

namespace fdapde {

// parser for CSV (Comma-Separated Values) files, RFC 4180 compliant
template <typename T> class CSVFile {
private:
template <typename CharBuff>
requires(internals::is_char_buff<CharBuff>)
T parse_value_(const CharBuff& token) const {
// check if token is recognized as na
if (std::find(na_values_.begin(), na_values_.end(), token) != na_values_.end()) {
return std::numeric_limits<T>::quiet_NaN();
}
// parse token as numeric
if constexpr (std::is_same_v<T, double>) { return internals::stod(token); }
if constexpr (std::is_same_v<T, int>) { return internals::stoi(token); }
return T {};
}

std::string_view& skipquote_(bool skip_quote, std::string_view& token) const {
if (skip_quote) [[likely]] {
if (!token.empty() && token.front() == '"') token.remove_prefix(1);
if (!token.empty() && token.back() == '"') token.remove_suffix(1);
}
return token;
}
// parsed data
std::vector<T> data_ {};
std::size_t n_cols_ = 0, n_rows_ = 0;
std::vector<std::string> colnames_ {};

// tokens matching any of these strings are parsed as NaN
std::vector<std::string> na_values_ = {"NA", "NaN", "nan"};
public:
CSVFile() = default;
CSVFile(
const char* filename, bool header, char sep, bool index_col, bool skip_quote = true, std::size_t chunksize = 4) :
n_cols_(0), n_rows_(0), colnames_() {
parse(filename, header, sep, index_col, skip_quote, chunksize);
}
CSVFile(const char* filename, bool index_col, bool skip_quote = true, std::size_t chunksize = 4) :
CSVFile(filename, true, ',', index_col, skip_quote, chunksize) { }
CSVFile(const std::string& filename, bool index_col, bool skip_quote = true, std::size_t chunksize = 4) :
CSVFile(filename.c_str(), index_col, skip_quote, chunksize) { }

Eigen::Map<const DMatrix<T, Eigen::RowMajor>> as_matrix() const {
return Eigen::Map<const DMatrix<T, Eigen::RowMajor>>(data_.data(), n_rows_, n_cols_);
}
// modifiers
void set_na_values(const std::vector<std::string>& na_values) { na_values_ = na_values; }
std::size_t cols() const { return n_cols_; }
std::size_t rows() const { return n_rows_; }
const std::vector<T>& data() const { return data_; }
const std::vector<std::string>& colnames() const { return colnames_; }
// parsing function
void parse(
const char* filename, bool header = true, char sep = ',', bool index_col = false, bool skip_quote = true,
std::size_t chunksize = 4) {
if (!std::filesystem::exists(filename))
throw std::runtime_error("file " + std::string(filename) + " not found.");
auto stream = batched_istream(filename, chunksize);
bool header_ = header;
std::size_t col_id = 0;
std::string last_token;

while (stream) {
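// read the file block by block; a token cut at a block boundary is saved in last_token and
// merged with its continuation at the start of the next block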
stream.read();
const char* buff = stream.data();
// tokenize input stream
internals::token_stream token_stream_(buff, stream.size(), sep);

// known issue: parsing is incorrect when the file does not fit in a single chunk

while (token_stream_) {
auto line = token_stream_.get_line();
if (header_) [[unlikely]] { // header parsing logic
header_ = false;
while (line.has_token()) {
std::string_view& token = skipquote_(skip_quote, line.get_token());
if (index_col == false && n_cols_ != 0) { colnames_.push_back(std::string(token)); }
n_cols_++;
++line;
}
} else { // data parsing logic
while (line.has_token()) {
if (index_col == false && col_id == 0) {
// skip the first (index) column
} else {
std::string_view& token = skipquote_(skip_quote, line.get_token());
if (line.eof()) { // skip parsing and wait for next block
last_token = token;
} else {
if (!last_token.empty()) {
last_token += token; // merge with the token carried over from the previous block
data_.push_back(parse_value_(last_token));
last_token.clear();
} else if (!token.empty()) {
data_.push_back(parse_value_(token));
}
}
}
if (!line.eof()) { col_id = (col_id + 1) % n_cols_; }
++line;
}
}
}
}
// process any token left pending after the last block of the stream
if (!last_token.empty()) { data_.push_back(parse_value_(last_token)); }
if (index_col == false) n_cols_ = n_cols_ - 1;
if (data_.size() % n_cols_ != 0) throw std::invalid_argument("csv parsing error.");
n_rows_ = data_.size() / n_cols_;
return;
}
};

// known issue: parsing with index_col == true is currently broken
template <typename T> CSVFile<T> read_csv(const std::string& filename, bool header = true, bool index_col = false) {
CSVFile<T> csv(filename.c_str(), header, /* sep = */ ',', index_col, /* skip_quote = */ true, /* chunksize = */ 1000);
return csv;
}

} // namespace fdapde

#endif // __CSV_H__
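
And a sketch of how the revisited CSV reader is meant to be used (data.csv is again a placeholder, assumed to have a header row; note that, per the parsing logic above, with index_col == false the first column of the file is dropped):

#include <iostream>
#include "fdaPDE/geoframe/csv.h"

int main() {
    // read data.csv with header = true, index_col = false (the defaults of read_csv)
    auto csv = fdapde::read_csv<double>("data.csv");
    std::cout << csv.rows() << " x " << csv.cols() << std::endl;
    for (const auto& name : csv.colnames()) { std::cout << name << " "; }
    auto M = csv.as_matrix();   // zero-copy, row-major Eigen view of the parsed data
    return 0;
}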