Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions be/src/format/csv/csv_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -579,14 +579,13 @@ Status CsvReader::_create_line_reader() {
} else {
// In a load task, _file_slot_descs is an empty vector, so col_sep_num must be set to 0.
size_t col_sep_num = _file_slot_descs.size() > 1 ? _file_slot_descs.size() - 1 : 0;
text_line_reader_ctx = std::make_shared<EncloseCsvLineReaderCtx>(
_enclose_reader_ctx = std::make_shared<EncloseCsvLineReaderCtx>(
_line_delimiter, _line_delimiter_length, _value_separator, _value_separator_length,
col_sep_num, _enclose, _escape, _keep_cr);
text_line_reader_ctx = _enclose_reader_ctx;

_fields_splitter = std::make_unique<EncloseCsvTextFieldSplitter>(
_trim_tailing_spaces, true,
std::static_pointer_cast<EncloseCsvLineReaderCtx>(text_line_reader_ctx),
_value_separator_length, _enclose);
_trim_tailing_spaces, true, _enclose_reader_ctx, _value_separator_length, _enclose);
}
switch (_file_format_type) {
case TFileFormatType::FORMAT_CSV_PLAIN:
Expand Down Expand Up @@ -802,8 +801,15 @@ Status CsvReader::_parse_col_types(size_t col_nums, std::vector<DataTypePtr>* co
// Strips a leading UTF-8 byte order mark (bytes EF BB BF) from a line, if present.
//
// @param ptr   start of the line buffer
// @param size  in/out: line length in bytes; reduced by the BOM length when a BOM is removed
// @return pointer to the first byte after the BOM, or `ptr` unchanged when no BOM is found
const uint8_t* CsvReader::_remove_bom(const uint8_t* ptr, size_t& size) {
    constexpr size_t bom_size = 3;
    if (size >= bom_size && ptr[0] == 0xEF && ptr[1] == 0xBB && ptr[2] == 0xBF) {
        LOG(INFO) << "remove bom";
        size -= bom_size;
        // In enclose mode, column_sep_positions were computed on the original line
        // (including BOM). After shifting the pointer, we must adjust those positions
        // so they remain correct relative to the new start. This has to happen before
        // returning; otherwise the positions keep pointing bom_size bytes too far.
        if (_enclose_reader_ctx) {
            _enclose_reader_ctx->adjust_column_sep_positions(bom_size);
        }
        return ptr + bom_size;
    }
    return ptr;
}
Expand Down
3 changes: 3 additions & 0 deletions be/src/format/csv/csv_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,9 @@ class CsvReader : public GenericReader {
bool _empty_field_as_null = false;

io::IOContext* _io_ctx = nullptr;
std::shared_ptr<io::IOContext> _io_ctx_holder;
// Stored to adjust column_sep_positions when BOM is removed in enclose mode
std::shared_ptr<EncloseCsvLineReaderCtx> _enclose_reader_ctx;
// Holds the slices of the source text after it has been split.
std::vector<Slice> _split_values;
std::vector<int> _use_nullable_string_opt;
Expand Down
9 changes: 9 additions & 0 deletions be/src/format/file_reader/new_plain_text_line_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,15 @@ class EncloseCsvLineReaderCtx final : public BaseTextLineReaderContext<EncloseCs
return _column_sep_positions;
}

// Shift every recorded column separator position towards the start of the
// line by `offset` bytes. Called after BOM bytes have been stripped from the
// front of a line, so that the stored positions match the shortened line.
void adjust_column_sep_positions(size_t offset) {
    auto cursor = _column_sep_positions.begin();
    const auto last = _column_sep_positions.end();
    while (cursor != last) {
        *cursor -= offset;
        ++cursor;
    }
}

const uint8_t* read_line_impl(const uint8_t* start, size_t length);

private:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
"14","bom_test,data",2023-08-01,"hello,world","2023-08-01 12:00:00","bom,value"
"15","normal",2023-08-02,"test","2023-08-02 13:00:00","data"
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,6 @@
11 abc,def 2023-07-15 ghi 2023-07-20T05:48:31 jkl\nmne
12 {"a": 1} 2023-07-15 def 2023-07-20T05:48:31 {"a": 1}
13 {"a": 2} 2023-07-15 def 2023-07-20T05:48:31 {"a": 2}
14 bom_test,data 2023-08-01 hello,world 2023-08-01T12:00 bom,value
15 normal 2023-08-02 test 2023-08-02T13:00 data

Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,16 @@ suite("test_csv_with_enclose_and_escape", "p0") {
file "enclose_with_same_escape.csv"
}

// test CSV file with UTF-8 BOM and enclose
streamLoad {
table "${tableName}"
set 'column_separator', ','
set 'enclose', "\""
set 'escape', '\\'

file "enclose_with_bom.csv"
}

sql "sync"
qt_select """
SELECT * FROM ${tableName} ORDER BY k1, k2
Expand Down
Loading