Commit

draft support for GeoFrame data structure, Triangulation<1,1> is constructed from the number of nodes, not of subintervals, MdArray support for full_extent, buffered_istream, parsing utils (token_stream), csv implementation revisited to use buffered_istream. minor adjustments
AlePalu committed Oct 16, 2024
1 parent d583eb8 commit c440a9d
Showing 12 changed files with 1,044 additions and 197 deletions.
127 changes: 127 additions & 0 deletions fdaPDE/geoframe/batched_istream.h
@@ -0,0 +1,127 @@
// This file is part of fdaPDE, a C++ library for physics-informed
// spatial and functional data analysis.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

#ifndef __BATCHED_ISTREAM_H__
#define __BATCHED_ISTREAM_H__

#include <cstddef>
#include <filesystem>
#include <fstream>
#include <stdexcept>
#include <string>

namespace fdapde {
namespace internals {

// A buffered input stream implementation for block-reading of files
struct batched_istream_impl {
using char_t = char;
using size_t = std::size_t;
using buff_t = char_t*;
static constexpr size_t blk_sz__ = 1024; // block size is expressed as multiples of 1KB
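// e.g., the default blk_factor = 4 used by the constructors below yields blocks of 4 * 1024 = 4096 bytes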

batched_istream_impl() noexcept = default;
batched_istream_impl(const std::string& filename, size_t blk_factor = 4) :
stream_(), size_(0), blk_sz_(blk_sz__ * blk_factor), pos_(0) {
open(filename, blk_sz_);
}
batched_istream_impl(const char* filename, size_t blk_factor = 4) :
stream_(), size_(0), blk_sz_(blk_sz__ * blk_factor), pos_(0) {
open(filename, blk_sz_);
}
batched_istream_impl(const std::filesystem::path& filename, size_t blk_factor = 4) :
stream_(), size_(0), blk_sz_(blk_sz__ * blk_factor), pos_(0) {
open(filename, blk_sz_);
}
// read next block of data
void read() {
// every block is blk_sz_ bytes long, except the last one, which holds the remaining bytes
buff_sz_ = (pos_ == n_blk_ - 1) ? size_ - (n_blk_ - 1) * blk_sz_ : blk_sz_;
// fetch block of data
stream_.read(buff_, buff_sz_);
pos_++;
}
// return number of valid bytes last extracted in buffer
size_t size() const { return buff_sz_; }
// pointer to read data
const char* data() const { return buff_; }
size_t tellg() const { return pos_; }
batched_istream_impl& seekg(size_t pos) {
fdapde_assert(pos < n_blk_);
pos_ = pos;
return *this;
}
bool end() { return stream_.peek() == EOF; }
operator bool() { return !end(); }
// counts how many newline characters '\n' appear in the file (requires reading the entire file)
size_t n_lines() {
size_t n = 0;
while (!end()) {
read();
for (size_t i = 0; i < size(); ++i) {
if (buff_[i] == '\n') { n++; }
}
}
// reset status
stream_.seekg(0, std::ios::beg); // rewind to the beginning
pos_ = 0;
return n;
}
// file operations
void close() {
if (buff_) { // guard against double free and never-opened streams
delete[] buff_; // deallocate memory
buff_ = nullptr;
}
// reset status
if (stream_.is_open()) { stream_.close(); }
size_ = 0, n_blk_ = 0, buff_sz_ = 0, pos_ = 0;
}
bool is_open() const { return stream_.is_open(); }
void open(const char* filename, size_t blk_sz) {
if (!std::filesystem::exists(filename)) {
throw std::runtime_error("file " + std::string(filename) + " not found.");
}
stream_.open(filename, std::ios::binary | std::ios::ate);
size_ = stream_.tellg();
n_blk_ = (size_ + blk_sz - 1) / blk_sz; // number of blocks, rounded up (integer ceiling division)
blk_sz_ = blk_sz;
stream_.seekg(0, std::ios::beg); // rewind to the beginning
buff_ = new char_t[blk_sz];
pos_ = 0;
}
void open(const std::string& filename, size_t blk_sz) { open(filename.c_str(), blk_sz); }
void open(const std::filesystem::path& filename, size_t blk_sz) { open(filename.string(), blk_sz); }
~batched_istream_impl() { close(); }
private:
std::ifstream stream_;
size_t size_ = 0; // size (in bytes) of stream
size_t blk_sz_ = 0; // size (in bytes) of a block: blk_factor * blk_sz__
size_t n_blk_ = 0; // number of blocks in stream
size_t buff_sz_ = 0; // number of valid bytes in buff_
size_t pos_ = 0; // index of the next block to read
buff_t buff_ = nullptr; // read buffer, blk_sz_ bytes long
};

} // namespace internals

template <typename FileName> auto batched_istream(const FileName& filename, std::size_t blk_factor) {
return internals::batched_istream_impl(filename, blk_factor);
}
template <typename FileName> auto batched_istream(const FileName& filename) {
return internals::batched_istream_impl(filename);
}

} // namespace fdapde

#endif // __BATCHED_ISTREAM_H__
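
For orientation, a minimal usage sketch of the new stream (the file name data.csv is a placeholder; read(), data(), size() and n_lines() are the members defined above):

#include <iostream>
#include <string_view>
#include "fdaPDE/geoframe/batched_istream.h"

int main() {
    // open the file with blk_factor = 4, i.e., 4 * 1024 = 4096-byte blocks
    auto stream = fdapde::batched_istream("data.csv", 4);
    std::cout << "lines: " << stream.n_lines() << std::endl;   // scans the whole file, then rewinds
    while (stream) {
        stream.read();                                         // fetch the next block
        std::string_view blk(stream.data(), stream.size());    // valid bytes of the last read block
        // ... process blk ...
    }
    return 0;
}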
147 changes: 147 additions & 0 deletions fdaPDE/geoframe/csv.h
@@ -0,0 +1,147 @@
// This file is part of fdaPDE, a C++ library for physics-informed
// spatial and functional data analysis.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

#ifndef __CSV_H__
#define __CSV_H__

#include <algorithm>
#include <limits>
#include <stdexcept>
#include <string>
#include <string_view>
#include <vector>

#include "batched_istream.h"
#include "parsing.h"

namespace fdapde {

// parser for CSV (Comma-Separated Values) files, RFC 4180 compliant
template <typename T> class CSVFile {
private:
template <typename CharBuff>
requires(internals::is_char_buff<CharBuff>)
T parse_value_(const CharBuff& token) const {
// check if token is recognized as na
if (std::find(na_values_.begin(), na_values_.end(), token) != na_values_.end()) {
return std::numeric_limits<T>::quiet_NaN();
}
// parse token as numeric
if constexpr (std::is_same_v<T, double>) { return internals::stod(token); }
if constexpr (std::is_same_v<T, int>) { return internals::stoi(token); }
return T {};
}

std::string_view& skipquote_(bool skip_quote, std::string_view& token) const {
if (skip_quote) [[likely]] {
if (!token.empty() && token.front() == '"') token.remove_prefix(1);
if (!token.empty() && token.back() == '"') token.remove_suffix(1);
}
return token;
}
// parsed data
std::vector<T> data_ {};
std::size_t n_cols_ = 0, n_rows_ = 0;
std::vector<std::string> colnames_ {};

// tokens matching any of these strings are parsed as NaN
std::vector<std::string> na_values_ = {"NA", "NaN", "nan"};
public:
CSVFile() = default;
CSVFile(
const char* filename, bool header, char sep, bool index_col, bool skip_quote = true, std::size_t chunksize = 4) :
n_cols_(0), n_rows_(0), colnames_() {
parse(filename, header, sep, index_col, skip_quote, chunksize);
}
CSVFile(const char* filename, bool index_col, bool skip_quote = true, std::size_t chunksize = 4) :
CSVFile(filename, true, ',', index_col, skip_quote, chunksize) { }
CSVFile(const std::string& filename, bool index_col, bool skip_quote = true, std::size_t chunksize = 4) :
CSVFile(filename.c_str(), index_col, skip_quote, chunksize) { }

Eigen::Map<const DMatrix<T, Eigen::RowMajor>> as_matrix() const {
return Eigen::Map<const DMatrix<T, Eigen::RowMajor>>(data_.data(), n_rows_, n_cols_);
}
// modifiers
void set_na_values(const std::vector<std::string>& na_values) { na_values_ = na_values; }
std::size_t cols() const { return n_cols_; }
std::size_t rows() const { return n_rows_; }
const std::vector<T>& data() const { return data_; }
const std::vector<std::string>& colnames() const { return colnames_; }
// parsing function
void parse(
const char* filename, bool header = true, char sep = ',', bool index_col = false, bool skip_quote = true,
std::size_t chunksize = 4) {
if (!std::filesystem::exists(filename))
throw std::runtime_error("file " + std::string(filename) + " not found.");
auto stream = batched_istream(filename, chunksize);
bool header_ = header;
std::size_t col_id = 0;
std::string last_token;

while (stream) {
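// read the file block by block; a token cut at a block boundary is saved in last_token and
// merged with its continuation at the start of the next block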
stream.read();
const char* buff = stream.data();
// tokenize input stream
internals::token_stream token_stream_(buff, stream.size(), sep);

// known issue: parsing is incorrect when the file does not fit in a single chunk

while (token_stream_) {
auto line = token_stream_.get_line();
if (header_) [[unlikely]] { // header parsing logic
header_ = false;
while (line.has_token()) {
std::string_view& token = skipquote_(skip_quote, line.get_token());
if (index_col == false && n_cols_ != 0) { colnames_.push_back(std::string(token)); }
n_cols_++;
++line;
}
} else { // data parsing logic
while (line.has_token()) {
if (index_col == false && col_id == 0) {
// skip the first (index) column
} else {
std::string_view& token = skipquote_(skip_quote, line.get_token());
if (line.eof()) { // skip parsing and wait for next block
last_token = token;
} else {
if (!last_token.empty()) {
last_token += token; // merge with the token carried over from the previous block
data_.push_back(parse_value_(last_token));
last_token.clear();
} else if (!token.empty()) {
data_.push_back(parse_value_(token));
}
}
}
if (!line.eof()) { col_id = (col_id + 1) % n_cols_; }
++line;
}
}
}
}
// process any token left pending after the last block of the stream
if (!last_token.empty()) { data_.push_back(parse_value_(last_token)); }
if (index_col == false) n_cols_ = n_cols_ - 1;
if (data_.size() % n_cols_ != 0) throw std::invalid_argument("csv parsing error.");
n_rows_ = data_.size() / n_cols_;
return;
}
};

// known issue: parsing with index_col == true is currently broken
template <typename T> CSVFile<T> read_csv(const std::string& filename, bool header = true, bool index_col = false) {
CSVFile<T> csv(filename.c_str(), header, /* sep = */ ',', index_col, /* skip_quote = */ true, /* chunksize = */ 1000);
return csv;
}

} // namespace fdapde

#endif // __CSV_H__
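
And a sketch of how the revisited CSV reader is meant to be used (data.csv is again a placeholder, assumed to have a header row; note that, per the parsing logic above, with index_col == false the first column of the file is dropped):

#include <iostream>
#include "fdaPDE/geoframe/csv.h"

int main() {
    // read data.csv with header = true, index_col = false (the defaults of read_csv)
    auto csv = fdapde::read_csv<double>("data.csv");
    std::cout << csv.rows() << " x " << csv.cols() << std::endl;
    for (const auto& name : csv.colnames()) { std::cout << name << " "; }
    auto M = csv.as_matrix();   // zero-copy, row-major Eigen view of the parsed data
    return 0;
}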