Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve line parser #294

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
12 changes: 12 additions & 0 deletions .circleci/config.yml
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The project uses GitHub Actions.

Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
version: 2
jobs:
build:
parameters:
ruby-version:
type: string
docker:
- image: cimg/ruby:3.2.0
steps:
- checkout
- run: bundle install
- run: bundle exec rake spec
5 changes: 5 additions & 0 deletions Gemfile
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GitHub Actions has a compatibility matrix that tests many Ruby and ActiveSupport versions.
FastCSV should be added to the gemspec, which is the equivalent of a Gemfile for a library.

All changes in this file should be reverted.

Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
source "https://rubygems.org"

ruby '3.2.0'

gem 'fastcsv'
gem 'activesupport', '~> 7.0.8.4'

# Specify your gem's dependencies in csvlint.rb.gemspec
gemspec
2 changes: 1 addition & 1 deletion lib/csvlint.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
require "csv"
require "fastcsv"
require "date"
require "open-uri"
require "tempfile"
Expand Down
85 changes: 20 additions & 65 deletions lib/csvlint/validate.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module Csvlint
class Validator
class LineCSV < CSV
class LineCSV < FastCSV
ENCODE_RE = Hash.new do |h, str|
h[str] = Regexp.new(str)
end
Expand Down Expand Up @@ -77,7 +77,6 @@ def initialize(source, dialect = {}, schema = nil, options = {})
@extension = parse_extension(source) unless @source.nil?

@expected_columns = 0
@col_counts = []
@line_breaks = []

@errors += @schema.errors unless @schema.nil?
Expand All @@ -90,7 +89,6 @@ def initialize(source, dialect = {}, schema = nil, options = {})

def validate
if /.xls(x)?/.match?(@extension)
build_warnings(:excel, :context)
return
end
locate_schema unless @schema.instance_of?(Csvlint::Schema)
Expand Down Expand Up @@ -185,22 +183,29 @@ def parse_contents(stream, line = nil)
@csv_options[:encoding] = @encoding

begin
row = LineCSV.parse_line(stream, **@csv_options)
rescue LineCSV::MalformedCSVError => e
row = nil
LineCSV.raw_parse(stream, @csv_options) do |raw_row|
row = raw_row
end
rescue FastCSV::MalformedCSVError => e
build_exception_messages(e, stream, current_line) unless e.message.include?("UTF") && @reported_invalid_encoding
end

if row != nil
row = row.map { |r| r == nil ? "" : r }
end

if row
if current_line <= 1 && @csv_header
# this conditional should be refactored somewhere
row = row.reject { |col| col.nil? || col.empty? }
row = row.reject { |col| col.nil? }
validate_header(row)
@col_counts << row.size
else
build_formats(row)
@col_counts << row.reject { |col| col.nil? || col.empty? }.size
@expected_columns = row.size unless @expected_columns != 0
build_errors(:blank_rows, :structure, current_line, nil, stream.to_s) if row.reject { |c| c.nil? || c.empty? }.size == 0
unless @csv_options[:skip_blanks]
build_errors(:blank_rows, :structure, current_line, nil, stream.to_s) if row.reject { |c| c.nil? }.size == 0
end
# Builds errors and warnings related to the provided schema file
if @schema
@schema.validate_row(row, current_line, all_errors, @source, @validate)
Expand All @@ -216,16 +221,8 @@ def parse_contents(stream, line = nil)
end

# Emits the file-level warnings derived from the collected column counts and
# then runs the remaining checks in a fixed order.
#
# A :title_row warning is raised when the first recorded column count is
# smaller than the mean of all recorded counts; a :check_options warning is
# raised when exactly one column is expected.
#
# Returns whatever validate_encoding returns.
def finish
  total_columns = @col_counts.reduce(:+)
  if total_columns
    mean_columns = total_columns / @col_counts.size.to_f
    first_is_narrow = @col_counts.first < mean_columns
    build_warnings(:title_row, :structure) if first_is_narrow
  end

  single_column = @expected_columns == 1
  build_warnings(:check_options, :structure) if single_column

  check_consistency
  check_foreign_keys if @validate
  check_mixed_linebreaks
  validate_encoding
end

def validate_metadata
Expand All @@ -240,7 +237,6 @@ def validate_metadata
@csv_header = false if $1 == "absent"
assumed_header = false
end
build_warnings(:no_content_type, :context) if @content_type.nil?
build_errors(:wrong_content_type, :context) unless @content_type && @content_type =~ /text\/csv/
end
@header_processed = true
Expand Down Expand Up @@ -278,7 +274,6 @@ def validate_metadata
@schema = schema
else
warn_if_unsuccessful = true
build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema)
end
end
rescue OpenURI::HTTPError
Expand All @@ -296,7 +291,7 @@ def report_line_breaks(line_no = nil)
line_break = get_line_break(@input)
@line_breaks << line_break
unless line_breaks_reported?
if line_break != "\r\n"
if line_break != "\n"
build_info_messages(:nonrfc_line_breaks, :structure, line_no)
@line_breaks_reported = true
end
Expand Down Expand Up @@ -330,17 +325,6 @@ def set_dialect
@csv_options = dialect_to_csv_options(@dialect)
end

# Warns about missing or non-UTF-8 encodings.
#
# When a "content-type" header is present: :no_encoding is raised if it has
# no "charset=" part, :encoding if the charset is not utf-8 (matched
# case-insensitively). Independently of the header, :encoding is raised
# whenever @encoding is not exactly "UTF-8".
def validate_encoding
  content_type = @headers["content-type"]

  if content_type
    if /charset=/.match?(content_type)
      declares_utf8 = /charset=utf-8/i.match?(content_type)
      build_warnings(:encoding, :context) unless declares_utf8
    else
      build_warnings(:no_encoding, :context)
    end
  end

  build_warnings(:encoding, :context) unless @encoding == "UTF-8"
end

# Reports a line-break error when more than one distinct terminator has been
# recorded in @line_breaks.
def check_mixed_linebreaks
  distinct_terminators = @line_breaks.uniq
  build_linebreak_error if distinct_terminators.length > 1
end
Expand Down Expand Up @@ -376,10 +360,7 @@ def validate_header(header)
names = Set.new
header.map { |h| h.strip! } if @dialect["trim"] == :true
header.each_with_index do |name, i|
build_warnings(:empty_column_name, :schema, nil, i + 1) if name == ""
if names.include?(name)
build_warnings(:duplicate_column_name, :schema, nil, i + 1)
else
if !names.include?(name)
names << name
end
end
Expand All @@ -405,45 +386,26 @@ def dialect_to_csv_options(dialect)
skipinitialspace = dialect["skipInitialSpace"] || true
delimiter = dialect["delimiter"]
delimiter += " " if !skipinitialspace
skipblanks = dialect["skip_blanks"] || false
{
col_sep: delimiter,
row_sep: dialect["lineTerminator"],
quote_char: dialect["quoteChar"],
skip_blanks: false
skip_blanks: skipblanks
}
end

def build_formats(row)
row.each_with_index do |col, i|
next if col.nil? || col.empty?
next if col.nil?
@formats[i] ||= Hash.new(0)

format =
if col.strip[FORMATS[:numeric]]
:numeric
elsif uri?(col)
:uri
elsif possible_date?(col)
date_formats(col)
else
:string
end
format = :string

@formats[i][format] += 1
end
end

# Warns about columns whose values do not share one dominant format.
#
# @formats holds, per column index, a hash of format => occurrence count
# (entries may be nil and are skipped). A column gets an
# :inconsistent_values warning, tagged with its 1-based position, when no
# single format accounts for at least 90% of its counted values.
def check_consistency
  @formats.each.with_index(1) do |tally, column|
    next unless tally

    total = tally.values.reduce(:+).to_f
    dominant = tally.any? { |_, count| count / total >= 0.9 }
    build_warnings(:inconsistent_values, :schema, nil, column) unless dominant
  end
end

def check_foreign_keys
if @schema.instance_of? Csvlint::Csvw::TableGroup
@schema.validate_foreign_keys
Expand Down Expand Up @@ -492,15 +454,13 @@ def locate_schema
return
else
warn_if_unsuccessful = true
build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema)
end
end
rescue Errno::ENOENT
rescue OpenURI::HTTPError, URI::BadURIError, ArgumentError
rescue => e
raise e
end
build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema) if warn_if_unsuccessful
@schema = nil
end

Expand Down Expand Up @@ -577,12 +537,7 @@ def line_limit_reached?
end

# Reports which line terminator +line+ ends with.
#
# @param line [String] the text to inspect
# @return [String] "\r\n" when +line+ ends with CRLF, otherwise "\n"
def get_line_break(line)
  # Test the real suffix: inspecting only the second-to-last character
  # misreports input such as "a\rb" or a lone "\r" as CRLF, and building a
  # full character array just to read the tail is unnecessary.
  line.end_with?("\r\n") ? "\r\n" : "\n"
end

FORMATS = {
Expand Down
Loading
Loading