diff --git a/be/src/format/csv/csv_reader.cpp b/be/src/format/csv/csv_reader.cpp index 9521b1054e6d09..78b3db4e8fe5fc 100644 --- a/be/src/format/csv/csv_reader.cpp +++ b/be/src/format/csv/csv_reader.cpp @@ -579,14 +579,13 @@ Status CsvReader::_create_line_reader() { } else { // in load task, the _file_slot_descs is empty vector, so we need to set col_sep_num to 0 size_t col_sep_num = _file_slot_descs.size() > 1 ? _file_slot_descs.size() - 1 : 0; - text_line_reader_ctx = std::make_shared( + _enclose_reader_ctx = std::make_shared( _line_delimiter, _line_delimiter_length, _value_separator, _value_separator_length, col_sep_num, _enclose, _escape, _keep_cr); + text_line_reader_ctx = _enclose_reader_ctx; _fields_splitter = std::make_unique( - _trim_tailing_spaces, true, - std::static_pointer_cast(text_line_reader_ctx), - _value_separator_length, _enclose); + _trim_tailing_spaces, true, _enclose_reader_ctx, _value_separator_length, _enclose); } switch (_file_format_type) { case TFileFormatType::FORMAT_CSV_PLAIN: @@ -802,8 +801,15 @@ Status CsvReader::_parse_col_types(size_t col_nums, std::vector* co const uint8_t* CsvReader::_remove_bom(const uint8_t* ptr, size_t& size) { if (size >= 3 && ptr[0] == 0xEF && ptr[1] == 0xBB && ptr[2] == 0xBF) { LOG(INFO) << "remove bom"; - size -= 3; - return ptr + 3; + constexpr size_t bom_size = 3; + size -= bom_size; + // In enclose mode, column_sep_positions were computed on the original line + // (including BOM). After shifting the pointer, we must adjust those positions + // so they remain correct relative to the new start. + if (_enclose_reader_ctx) { + _enclose_reader_ctx->adjust_column_sep_positions(bom_size); + } + return ptr + bom_size; } return ptr; } diff --git a/be/src/format/csv/csv_reader.h b/be/src/format/csv/csv_reader.h index 36ff41084d557c..40fc0122801aab 100644 --- a/be/src/format/csv/csv_reader.h +++ b/be/src/format/csv/csv_reader.h @@ -277,6 +277,9 @@ class CsvReader : public GenericReader { bool _empty_field_as_null = false; io::IOContext* _io_ctx = nullptr; + std::shared_ptr _io_ctx_holder; + // Stored to adjust column_sep_positions when BOM is removed in enclose mode + std::shared_ptr _enclose_reader_ctx; // save source text which have been splitted. std::vector _split_values; std::vector _use_nullable_string_opt; diff --git a/be/src/format/file_reader/new_plain_text_line_reader.h b/be/src/format/file_reader/new_plain_text_line_reader.h index b2a38cdd8d34f9..05a14423231cee 100644 --- a/be/src/format/file_reader/new_plain_text_line_reader.h +++ b/be/src/format/file_reader/new_plain_text_line_reader.h @@ -184,6 +184,15 @@ class EncloseCsvLineReaderCtx final : public BaseTextLineReaderContext