From 7f6cd4ef3d8de1d7ef17168585e9afb30415e8ac Mon Sep 17 00:00:00 2001 From: "lisizhuo.lsz" Date: Tue, 2 Jun 2026 06:19:44 +0000 Subject: [PATCH 1/2] feat: add data_converter_utils, field_type_utils, file_type and decimal_utils --- .../common/utils/data_converter_utils.h | 280 ++++++++++++++++++ .../utils/data_converter_utils_test.cpp | 180 +++++++++++ src/paimon/common/utils/decimal_utils.cpp | 103 +++++++ src/paimon/common/utils/decimal_utils.h | 52 ++++ .../common/utils/decimal_utils_test.cpp | 36 +++ src/paimon/common/utils/field_type_utils.h | 131 ++++++++ .../common/utils/field_type_utils_test.cpp | 134 +++++++++ src/paimon/common/utils/file_type.cpp | 128 ++++++++ src/paimon/common/utils/file_type.h | 43 +++ src/paimon/common/utils/file_type_test.cpp | 183 ++++++++++++ 10 files changed, 1270 insertions(+) create mode 100644 src/paimon/common/utils/data_converter_utils.h create mode 100644 src/paimon/common/utils/data_converter_utils_test.cpp create mode 100644 src/paimon/common/utils/decimal_utils.cpp create mode 100644 src/paimon/common/utils/decimal_utils.h create mode 100644 src/paimon/common/utils/decimal_utils_test.cpp create mode 100644 src/paimon/common/utils/field_type_utils.h create mode 100644 src/paimon/common/utils/field_type_utils_test.cpp create mode 100644 src/paimon/common/utils/file_type.cpp create mode 100644 src/paimon/common/utils/file_type.h create mode 100644 src/paimon/common/utils/file_type_test.cpp diff --git a/src/paimon/common/utils/data_converter_utils.h b/src/paimon/common/utils/data_converter_utils.h new file mode 100644 index 0000000..7328988 --- /dev/null +++ b/src/paimon/common/utils/data_converter_utils.h @@ -0,0 +1,280 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/type.h" +#include "fmt/core.h" +#include "fmt/format.h" +#include "fmt/ranges.h" +#include "paimon/common/data/binary_row.h" +#include "paimon/common/data/binary_row_writer.h" +#include "paimon/common/data/binary_string.h" +#include "paimon/common/utils/string_utils.h" +#include "paimon/core/casting/date_to_string_cast_executor.h" +#include "paimon/defs.h" +#include "paimon/predicate/literal.h" +#include "paimon/result.h" +#include "paimon/status.h" + +namespace paimon { +class MemoryPool; + +#define RETURN_INVALID_WITH_FIELD_INFO(value, field_idx, value_str, type) \ + if ((value) == std::nullopt) { \ + return Status::Invalid( \ + fmt::format("cannot convert field idx {}, field value {} to type {}", field_idx, \ + value_str, type)); \ + } + +class DataConverterUtils { + public: + DataConverterUtils() = delete; + ~DataConverterUtils() = delete; + + using StrToBinaryRowConverter = + std::function; + using BinaryRowFieldToStrConverter = + std::function(const BinaryRow&, int32_t)>; + + static Result CreateDataToBinaryRowConverter(arrow::Type::type type, + MemoryPool* pool) { + StrToBinaryRowConverter converter; + switch (type) { + case arrow::Type::BOOL: + converter = [](const std::string& value_str, int32_t field_idx, + BinaryRowWriter* writer) { + auto value = StringUtils::StringToValue(value_str); + RETURN_INVALID_WITH_FIELD_INFO(value, field_idx, value_str, + arrow::internal::ToString(arrow::Type::BOOL)); + writer->WriteBoolean(field_idx, value.value()); + return Status::OK(); + }; + break; + case arrow::Type::INT8: + converter = [](const std::string& value_str, int32_t field_idx, + BinaryRowWriter* writer) { + auto value = StringUtils::StringToValue(value_str); + RETURN_INVALID_WITH_FIELD_INFO(value, field_idx, value_str, + arrow::internal::ToString(arrow::Type::INT8)); + writer->WriteByte(field_idx, value.value()); + return Status::OK(); + }; + break; + case arrow::Type::INT16: + converter = [](const std::string& value_str, int32_t field_idx, + BinaryRowWriter* writer) { + auto value = StringUtils::StringToValue(value_str); + RETURN_INVALID_WITH_FIELD_INFO(value, field_idx, value_str, + arrow::internal::ToString(arrow::Type::INT16)); + writer->WriteShort(field_idx, value.value()); + return Status::OK(); + }; + break; + case arrow::Type::INT32: + converter = [](const std::string& value_str, int32_t field_idx, + BinaryRowWriter* writer) { + auto value = StringUtils::StringToValue(value_str); + RETURN_INVALID_WITH_FIELD_INFO(value, field_idx, value_str, + arrow::internal::ToString(arrow::Type::INT32)); + writer->WriteInt(field_idx, value.value()); + return Status::OK(); + }; + break; + case arrow::Type::INT64: + converter = [](const std::string& value_str, int32_t field_idx, + BinaryRowWriter* writer) { + auto value = StringUtils::StringToValue(value_str); + RETURN_INVALID_WITH_FIELD_INFO(value, field_idx, value_str, + arrow::internal::ToString(arrow::Type::INT64)); + writer->WriteLong(field_idx, value.value()); + return Status::OK(); + }; + break; + case arrow::Type::FLOAT: + converter = [](const std::string& value_str, int32_t field_idx, + BinaryRowWriter* writer) { + auto value = StringUtils::StringToValue(value_str); + RETURN_INVALID_WITH_FIELD_INFO(value, field_idx, value_str, + arrow::internal::ToString(arrow::Type::FLOAT)); + writer->WriteFloat(field_idx, value.value()); + return Status::OK(); + }; + break; + case arrow::Type::DOUBLE: + converter = [](const std::string& value_str, int32_t field_idx, + BinaryRowWriter* writer) { + auto value = StringUtils::StringToValue(value_str); + RETURN_INVALID_WITH_FIELD_INFO(value, field_idx, value_str, + arrow::internal::ToString(arrow::Type::DOUBLE)); + writer->WriteDouble(field_idx, value.value()); + return Status::OK(); + }; + break; + case arrow::Type::STRING: + converter = [pool](const std::string& value_str, int32_t field_idx, + BinaryRowWriter* writer) { + BinaryString value = BinaryString::FromString(value_str, pool); + writer->WriteString(field_idx, value); + return Status::OK(); + }; + break; + case arrow::Type::DATE32: + converter = [](const std::string& value_str, int32_t field_idx, + BinaryRowWriter* writer) { + PAIMON_ASSIGN_OR_RAISE(int32_t date_value, + StringUtils::StringToDate(value_str)); + writer->WriteInt(field_idx, date_value); + return Status::OK(); + }; + break; + default: + return Status::NotImplemented( + fmt::format("Do not support type {} in partition binary row", + arrow::internal::ToString(type))); + } + return converter; + } + + // support float and double + template + static std::string FloatValueToString(const T& value, int32_t precision) { + std::stringstream oss; + if (value >= 1e-3 && value <= 1e7) { + oss << std::fixed << std::setprecision(sizeof(T)) << value; + std::string result = oss.str(); + auto pos = result.find_last_not_of('0'); + result.erase(pos + (result[pos] == '.') + 1, std::string::npos); + return result; + } + oss << std::uppercase << std::scientific << std::setprecision(precision) << value; + std::string result = oss.str(); + auto e_pos = result.find('E'); + if (e_pos != std::string::npos) { + if (result[e_pos + 1] == '+') { + result.erase(e_pos + 1, 1 + (result[e_pos + 2] == '0')); + } else { + if (result[e_pos + 1] == '-' && result[e_pos + 2] == '0') { + result.erase(e_pos + 2, 1); + } + } + auto zero_pos = e_pos - 1; + while (zero_pos >= 1 && result[zero_pos] == '0' && result[zero_pos - 1] != '.') { + zero_pos--; + } + if (e_pos - zero_pos - 1 > 0) { + result.erase(zero_pos + 1, e_pos - zero_pos - 1); + } + } + return result; + } + + static Result CreateBinaryRowFieldToStringConverter( + arrow::Type::type type, bool legacy_partition_name_enabled) { + BinaryRowFieldToStrConverter converter; + switch (type) { + case arrow::Type::BOOL: + converter = [](const BinaryRow& row, int32_t field_idx) { + bool data = row.GetBoolean(field_idx); + std::string result = data ? "true" : "false"; + return result; + }; + break; + case arrow::Type::INT8: + converter = [](const BinaryRow& row, int32_t field_idx) { + auto data = static_cast(row.GetByte(field_idx)); + return std::to_string(data); + }; + break; + case arrow::Type::INT16: + converter = [](const BinaryRow& row, int32_t field_idx) { + auto data = row.GetShort(field_idx); + return std::to_string(data); + }; + break; + case arrow::Type::INT32: + converter = [](const BinaryRow& row, int32_t field_idx) { + auto data = row.GetInt(field_idx); + return std::to_string(data); + }; + break; + case arrow::Type::INT64: + converter = [](const BinaryRow& row, int32_t field_idx) { + auto data = row.GetLong(field_idx); + return std::to_string(data); + }; + break; + case arrow::Type::FLOAT: + converter = [](const BinaryRow& row, int32_t field_idx) { + float data = row.GetFloat(field_idx); + return FloatValueToString(data, 6); + }; + break; + case arrow::Type::DOUBLE: + converter = [](const BinaryRow& row, int32_t field_idx) { + double data = row.GetDouble(field_idx); + return FloatValueToString(data, 15); + }; + break; + case arrow::Type::STRING: + converter = [](const BinaryRow& row, int32_t field_idx) { + BinaryString data = row.GetString(field_idx); + return data.ToString(); + }; + break; + case arrow::Type::DATE32: { + if (legacy_partition_name_enabled) { + converter = [](const BinaryRow& row, int32_t field_idx) -> Result { + int32_t data = row.GetDate(field_idx); + return std::to_string(data); + }; + } else { + auto date_to_string_cast_executor = + std::make_shared(); + converter = [date_to_string_cast_executor]( + const BinaryRow& row, + int32_t field_idx) -> Result { + int32_t data = row.GetDate(field_idx); + PAIMON_ASSIGN_OR_RAISE(Literal literal, + date_to_string_cast_executor->Cast( + Literal(FieldType::DATE, data), arrow::utf8())); + return literal.GetValue(); + }; + } + break; + } + default: + return Status::NotImplemented( + fmt::format("Do not support arrow {} in partition binary row", + arrow::internal::ToString(type))); + } + return converter; + } +}; + +} // namespace paimon diff --git a/src/paimon/common/utils/data_converter_utils_test.cpp b/src/paimon/common/utils/data_converter_utils_test.cpp new file mode 100644 index 0000000..d7dbae7 --- /dev/null +++ b/src/paimon/common/utils/data_converter_utils_test.cpp @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "paimon/common/utils/data_converter_utils.h" + +#include +#include +#include + +#include "arrow/type_fwd.h" +#include "gtest/gtest.h" +#include "paimon/memory/memory_pool.h" +#include "paimon/testing/utils/testharness.h" + +namespace paimon::test { +TEST(DataConverterUtilsTest, TestDataToBinaryRowConverterWithLegacyPartitionName) { + auto pool = GetDefaultPool(); + std::vector> data = { + {"true", arrow::Type::BOOL}, + {"10", arrow::Type::INT8}, + {"-20", arrow::Type::INT8}, + {"1556", arrow::Type::INT16}, + {"-2556", arrow::Type::INT16}, + {"348489", arrow::Type::INT32}, + {"-448489", arrow::Type::INT32}, + {"279039", arrow::Type::INT64}, + {"1234567", arrow::Type::INT64}, + {"0.334", arrow::Type::FLOAT}, + {"467.66472", arrow::Type::DOUBLE}, + {"abcde", arrow::Type::STRING}, + {"这是一个很长很长的中文", arrow::Type::STRING}, + {"10440", arrow::Type::DATE32}}; + + std::vector converters; + std::vector reconverters; + for (const auto& [value, type] : data) { + ASSERT_OK_AND_ASSIGN(auto converter, + DataConverterUtils::CreateDataToBinaryRowConverter(type, pool.get())); + converters.emplace_back(std::move(converter)); + ASSERT_OK_AND_ASSIGN(auto reconverter, + DataConverterUtils::CreateBinaryRowFieldToStringConverter( + type, /*legacy_partition_name_enabled=*/true)); + reconverters.emplace_back(reconverter); + } + // test not implement type + ASSERT_NOK(DataConverterUtils::CreateDataToBinaryRowConverter(arrow::Type::LIST, pool.get())); + + BinaryRow row(data.size()); + BinaryRowWriter writer(&row, 0, pool.get()); + for (size_t idx = 0; idx < data.size(); idx++) { + ASSERT_OK(converters[idx](data[idx].first, idx, &writer)); + } + // test invalid str + ASSERT_NOK(converters[0]("abc", /*idx=*/0, &writer)); + writer.Complete(); + + ASSERT_EQ(data.size(), row.GetFieldCount()); + ASSERT_EQ(true, row.GetBoolean(0)); + ASSERT_EQ(10, row.GetByte(1)); + ASSERT_EQ(-20, row.GetByte(2)); + ASSERT_EQ(1556, row.GetShort(3)); + ASSERT_EQ(-2556, row.GetShort(4)); + ASSERT_EQ(348489, row.GetInt(5)); + ASSERT_EQ(-448489, row.GetInt(6)); + ASSERT_EQ(279039, row.GetLong(7)); + ASSERT_EQ(1234567, row.GetLong(8)); + ASSERT_NEAR(0.334, row.GetFloat(9), 0.0000001); + ASSERT_NEAR(467.66472, row.GetDouble(10), 0.0000001); + ASSERT_EQ("abcde", row.GetString(11).ToString()); + ASSERT_EQ("这是一个很长很长的中文", row.GetString(12).ToString()); + ASSERT_EQ(10440, row.GetDate(13)); + + for (size_t idx = 0; idx < data.size(); idx++) { + ASSERT_OK_AND_ASSIGN(auto partition_field_str, reconverters[idx](row, idx)); + ASSERT_EQ(data[idx].first, partition_field_str); + } +} + +TEST(DataConverterUtilsTest, TestDataToBinaryRowConverterWithNoLegacyPartitionName) { + auto pool = GetDefaultPool(); + std::vector> data = { + {"true", arrow::Type::BOOL}, + {"10", arrow::Type::INT8}, + {"-20", arrow::Type::INT8}, + {"1556", arrow::Type::INT16}, + {"-2556", arrow::Type::INT16}, + {"348489", arrow::Type::INT32}, + {"-448489", arrow::Type::INT32}, + {"279039", arrow::Type::INT64}, + {"1234567", arrow::Type::INT64}, + {"0.334", arrow::Type::FLOAT}, + {"467.66472", arrow::Type::DOUBLE}, + {"abcde", arrow::Type::STRING}, + {"这是一个很长很长的中文", arrow::Type::STRING}, + {"1998-08-02", arrow::Type::DATE32}}; + + std::vector converters; + std::vector reconverters; + for (const auto& [value, type] : data) { + ASSERT_OK_AND_ASSIGN(auto converter, + DataConverterUtils::CreateDataToBinaryRowConverter(type, pool.get())); + converters.emplace_back(std::move(converter)); + ASSERT_OK_AND_ASSIGN(auto reconverter, + DataConverterUtils::CreateBinaryRowFieldToStringConverter( + type, /*legacy_partition_name_enabled=*/false)); + reconverters.emplace_back(reconverter); + } + BinaryRow row(data.size()); + BinaryRowWriter writer(&row, 0, pool.get()); + for (size_t idx = 0; idx < data.size(); idx++) { + ASSERT_OK(converters[idx](data[idx].first, idx, &writer)); + } + writer.Complete(); + + ASSERT_EQ(data.size(), row.GetFieldCount()); + ASSERT_EQ(true, row.GetBoolean(0)); + ASSERT_EQ(10, row.GetByte(1)); + ASSERT_EQ(-20, row.GetByte(2)); + ASSERT_EQ(1556, row.GetShort(3)); + ASSERT_EQ(-2556, row.GetShort(4)); + ASSERT_EQ(348489, row.GetInt(5)); + ASSERT_EQ(-448489, row.GetInt(6)); + ASSERT_EQ(279039, row.GetLong(7)); + ASSERT_EQ(1234567, row.GetLong(8)); + ASSERT_NEAR(0.334, row.GetFloat(9), 0.0000001); + ASSERT_NEAR(467.66472, row.GetDouble(10), 0.0000001); + ASSERT_EQ("abcde", row.GetString(11).ToString()); + ASSERT_EQ("这是一个很长很长的中文", row.GetString(12).ToString()); + ASSERT_EQ(10440, row.GetDate(13)); + + for (size_t idx = 0; idx < data.size(); idx++) { + ASSERT_OK_AND_ASSIGN(auto partition_field_str, reconverters[idx](row, idx)); + ASSERT_EQ(data[idx].first, partition_field_str); + } +} + +TEST(DataConverterUtilsTest, TestValueToStringSimple) { + ASSERT_EQ("233.0", DataConverterUtils::FloatValueToString(static_cast(233), 6)); + ASSERT_EQ("3.0E-4", + DataConverterUtils::FloatValueToString(static_cast(0.0003), 6)); + ASSERT_EQ("3.478589E10", + DataConverterUtils::FloatValueToString(static_cast(34785895352), 6)); + ASSERT_EQ("1.0E9", + DataConverterUtils::FloatValueToString(static_cast(1000000000), 6)); + ASSERT_EQ("1000000.0", + DataConverterUtils::FloatValueToString(static_cast(1000000), 6)); + ASSERT_EQ("467.6647", + DataConverterUtils::FloatValueToString(static_cast(467.6647), 6)); + + ASSERT_EQ("233.0", + DataConverterUtils::FloatValueToString(static_cast(233), 15)); + ASSERT_EQ("3.4785895352E10", + DataConverterUtils::FloatValueToString(static_cast(34785895352), 15)); + ASSERT_EQ("1.0E9", + DataConverterUtils::FloatValueToString(static_cast(1000000000), 15)); + ASSERT_EQ("1000000.0", + DataConverterUtils::FloatValueToString(static_cast(1000000), 15)); + ASSERT_EQ("467.66472", + DataConverterUtils::FloatValueToString(static_cast(467.66472), 6)); + ASSERT_EQ("123456.123456", DataConverterUtils::FloatValueToString( + static_cast(123456.123456), 6)); +} + +} // namespace paimon::test diff --git a/src/paimon/common/utils/decimal_utils.cpp b/src/paimon/common/utils/decimal_utils.cpp new file mode 100644 index 0000000..ff4aa1b --- /dev/null +++ b/src/paimon/common/utils/decimal_utils.cpp @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "paimon/common/utils/decimal_utils.h" + +#include +#include +#include + +#include "arrow/api.h" +#include "arrow/util/basic_decimal.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/decimal.h" +#include "fmt/format.h" + +namespace paimon { +Status DecimalUtils::CheckDecimalType(const arrow::DataType& type) { + auto* decimal_type = dynamic_cast(&type); + if (!decimal_type) { + return Status::Invalid(fmt::format("Invalid decimal type: {}", type.ToString())); + } + if (decimal_type->precision() > Decimal::MAX_PRECISION || + decimal_type->precision() < Decimal::MIN_PRECISION) { + return Status::Invalid(fmt::format("Invalid decimal type, precision must in range [{}, {}]", + Decimal::MIN_PRECISION, Decimal::MAX_PRECISION)); + } + if (decimal_type->precision() < decimal_type->scale()) { + return Status::Invalid(fmt::format("Invalid decimal type {}, precision must >= scale", + decimal_type->ToString())); + } + return Status::OK(); +} + +std::optional DecimalUtils::RescaleDecimalWithOverflowCheck( + const arrow::Decimal128& src_decimal, int32_t src_scale, int32_t target_precision, + int32_t target_scale) { + arrow::Decimal128 target_decimal = src_decimal; + if (src_scale < target_scale) { + int32_t delta_scale = target_scale - src_scale; + arrow::BasicDecimal128 min_bound = arrow::BasicDecimal128::GetMinSentinel(); + arrow::BasicDecimal128 max_bound = arrow::BasicDecimal128::GetMaxSentinel(); + min_bound /= arrow::BasicDecimal128::GetScaleMultiplier(delta_scale); + max_bound /= arrow::BasicDecimal128::GetScaleMultiplier(delta_scale); + // scaled_decimal may be overflow 128 bits + // noted that, arrow::Decimal.Rescale() is not safe + if (src_decimal > max_bound || src_decimal < min_bound) { + return std::nullopt; + } + target_decimal = src_decimal.IncreaseScaleBy(delta_scale); + } else if (src_scale > target_scale) { + target_decimal = src_decimal.ReduceScaleBy(src_scale - target_scale, /*round=*/true); + } + if (!target_decimal.FitsInPrecision(target_precision)) { + return std::nullopt; + } + return target_decimal; +} + +Result DecimalUtils::StrToInt128(const std::string& str) { + try { + Decimal::int128_t ret = 0; + size_t length = str.length(); + if (length > 0) { + bool is_negative = str[0] == '-'; + size_t posn = is_negative ? 1 : 0; + while (posn < length) { + size_t group = std::min(18ul, length - posn); + int64_t chunk = std::stoll(str.substr(posn, group)); + int64_t multiple = 1; + for (size_t i = 0; i < group; ++i) { + multiple *= 10; + } + ret *= multiple; + ret += chunk; + posn += group; + } + if (is_negative) { + ret = -ret; + } + } + return ret; + } catch (...) { + return Status::Invalid(fmt::format("invalid string: [{}], cannot convert to int128", str)); + } +} + +} // namespace paimon diff --git a/src/paimon/common/utils/decimal_utils.h b/src/paimon/common/utils/decimal_utils.h new file mode 100644 index 0000000..e8b10fe --- /dev/null +++ b/src/paimon/common/utils/decimal_utils.h @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include +#include + +#include "arrow/api.h" +#include "arrow/util/decimal.h" +#include "fmt/format.h" +#include "paimon/data/decimal.h" +#include "paimon/result.h" +#include "paimon/status.h" + +namespace arrow { +class DataType; +} // namespace arrow + +namespace paimon { +class DecimalUtils { + public: + DecimalUtils() = delete; + ~DecimalUtils() = delete; + + static Status CheckDecimalType(const arrow::DataType& type); + + static std::optional RescaleDecimalWithOverflowCheck( + const arrow::Decimal128& src_decimal, int32_t src_scale, int32_t target_precision, + int32_t target_scale); + + static Result StrToInt128(const std::string& str); +}; + +} // namespace paimon diff --git a/src/paimon/common/utils/decimal_utils_test.cpp b/src/paimon/common/utils/decimal_utils_test.cpp new file mode 100644 index 0000000..821829a --- /dev/null +++ b/src/paimon/common/utils/decimal_utils_test.cpp @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "paimon/common/utils/decimal_utils.h" + +#include + +#include "arrow/api.h" +#include "gtest/gtest.h" +#include "paimon/testing/utils/testharness.h" + +namespace paimon::test { + +TEST(DecimalUtilsTest, TestCheckDecimalType) { + ASSERT_NOK_WITH_MSG(DecimalUtils::CheckDecimalType(*arrow::int32()), "Invalid decimal type:"); + ASSERT_NOK_WITH_MSG(DecimalUtils::CheckDecimalType(*arrow::decimal128(20, 22)), + "precision must >= scale"); +} + +} // namespace paimon::test diff --git a/src/paimon/common/utils/field_type_utils.h b/src/paimon/common/utils/field_type_utils.h new file mode 100644 index 0000000..77da7db --- /dev/null +++ b/src/paimon/common/utils/field_type_utils.h @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include +#include + +#include "arrow/api.h" +#include "arrow/type_fwd.h" +#include "fmt/format.h" +#include "paimon/defs.h" +#include "paimon/result.h" +#include "paimon/status.h" + +namespace paimon { +class FieldTypeUtils { + public: + FieldTypeUtils() = delete; + ~FieldTypeUtils() = delete; + + static bool IsIntegerNumeric(const FieldType& type) { + if (type == FieldType::TINYINT || type == FieldType::SMALLINT || type == FieldType::INT || + type == FieldType::BIGINT) { + return true; + } + return false; + } + static bool IntegerScaleLargerThan(const FieldType& type, const FieldType& other_type) { + return (type == FieldType::SMALLINT && other_type == FieldType::TINYINT) || + (type == FieldType::INT && other_type != FieldType::BIGINT) || + (type == FieldType::BIGINT); + } + + static Result ConvertToFieldType(const arrow::Type::type& arrow_type) { + switch (arrow_type) { + case arrow::Type::type::BOOL: + return FieldType::BOOLEAN; + case arrow::Type::type::INT8: + return FieldType::TINYINT; + case arrow::Type::type::INT16: + return FieldType::SMALLINT; + case arrow::Type::type::INT32: + return FieldType::INT; + case arrow::Type::type::INT64: + return FieldType::BIGINT; + case arrow::Type::type::FLOAT: + return FieldType::FLOAT; + case arrow::Type::type::DOUBLE: + return FieldType::DOUBLE; + case arrow::Type::type::STRING: + return FieldType::STRING; + case arrow::Type::type::BINARY: + return FieldType::BINARY; + case arrow::Type::type::LARGE_BINARY: + return FieldType::BLOB; // TODO(xinyu): binary to large binary? + case arrow::Type::type::TIMESTAMP: + return FieldType::TIMESTAMP; + case arrow::Type::type::DECIMAL128: + return FieldType::DECIMAL; + case arrow::Type::type::DATE32: + return FieldType::DATE; + case arrow::Type::type::LIST: + return FieldType::ARRAY; + case arrow::Type::type::MAP: + return FieldType::MAP; + case arrow::Type::type::STRUCT: + return FieldType::STRUCT; + default: + return Status::Invalid( + fmt::format("Not support arrow type {}", static_cast(arrow_type))); + } + } + + static std::string FieldTypeToString(const FieldType& type) { + switch (type) { + case FieldType::BOOLEAN: + return "BOOLEAN"; + case FieldType::TINYINT: + return "TINYINT"; + case FieldType::SMALLINT: + return "SMALLINT"; + case FieldType::INT: + return "INT"; + case FieldType::BIGINT: + return "BIGINT"; + case FieldType::FLOAT: + return "FLOAT"; + case FieldType::DOUBLE: + return "DOUBLE"; + case FieldType::STRING: + return "STRING"; + case FieldType::BINARY: + return "BINARY"; + case FieldType::BLOB: + return "BLOB"; + case FieldType::TIMESTAMP: + return "TIMESTAMP"; + case FieldType::DECIMAL: + return "DECIMAL"; + case FieldType::DATE: + return "DATE"; + case FieldType::ARRAY: + return "ARRAY"; + case FieldType::MAP: + return "MAP"; + case FieldType::STRUCT: + return "STRUCT"; + default: + return "UNKNOWN, type id:" + std::to_string(static_cast(type)); + } + } +}; +} // namespace paimon diff --git a/src/paimon/common/utils/field_type_utils_test.cpp b/src/paimon/common/utils/field_type_utils_test.cpp new file mode 100644 index 0000000..7c2b237 --- /dev/null +++ b/src/paimon/common/utils/field_type_utils_test.cpp @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "paimon/common/utils/field_type_utils.h" + +#include "arrow/type_fwd.h" +#include "gtest/gtest.h" +#include "paimon/defs.h" +#include "paimon/testing/utils/testharness.h" + +namespace paimon::test { + +// Test case: Check if Integer Numeric types are correctly identified +TEST(FieldTypeUtilsTest, IsIntegerNumeric) { + ASSERT_TRUE(FieldTypeUtils::IsIntegerNumeric(FieldType::TINYINT)); + ASSERT_TRUE(FieldTypeUtils::IsIntegerNumeric(FieldType::SMALLINT)); + ASSERT_TRUE(FieldTypeUtils::IsIntegerNumeric(FieldType::INT)); + ASSERT_TRUE(FieldTypeUtils::IsIntegerNumeric(FieldType::BIGINT)); + + // Non-integer types should return false + ASSERT_FALSE(FieldTypeUtils::IsIntegerNumeric(FieldType::FLOAT)); + ASSERT_FALSE(FieldTypeUtils::IsIntegerNumeric(FieldType::STRING)); + ASSERT_FALSE(FieldTypeUtils::IsIntegerNumeric(FieldType::TIMESTAMP)); + ASSERT_FALSE(FieldTypeUtils::IsIntegerNumeric(FieldType::DECIMAL)); +} + +// Test case: Check IntegerScaleLargerThan function +TEST(FieldTypeUtilsTest, IntegerScaleLargerThan) { + ASSERT_TRUE(FieldTypeUtils::IntegerScaleLargerThan(FieldType::SMALLINT, FieldType::TINYINT)); + ASSERT_TRUE(FieldTypeUtils::IntegerScaleLargerThan(FieldType::INT, FieldType::SMALLINT)); + ASSERT_TRUE(FieldTypeUtils::IntegerScaleLargerThan(FieldType::BIGINT, FieldType::INT)); + ASSERT_TRUE(FieldTypeUtils::IntegerScaleLargerThan(FieldType::BIGINT, FieldType::SMALLINT)); + + // Should return false for other cases + ASSERT_FALSE(FieldTypeUtils::IntegerScaleLargerThan(FieldType::TINYINT, FieldType::SMALLINT)); + ASSERT_FALSE(FieldTypeUtils::IntegerScaleLargerThan(FieldType::SMALLINT, FieldType::INT)); + ASSERT_FALSE(FieldTypeUtils::IntegerScaleLargerThan(FieldType::INT, FieldType::BIGINT)); +} + +// Test case: Check ConvertToFieldType with various Arrow types +TEST(FieldTypeUtilsTest, ConvertToFieldType) { + // Test valid Arrow types and their conversions + ASSERT_OK_AND_ASSIGN(auto result, FieldTypeUtils::ConvertToFieldType(arrow::Type::type::BOOL)); + ASSERT_EQ(result, FieldType::BOOLEAN); + + ASSERT_OK_AND_ASSIGN(result, FieldTypeUtils::ConvertToFieldType(arrow::Type::type::INT8)); + ASSERT_EQ(result, FieldType::TINYINT); + + ASSERT_OK_AND_ASSIGN(result, FieldTypeUtils::ConvertToFieldType(arrow::Type::type::INT16)); + ASSERT_EQ(result, FieldType::SMALLINT); + + ASSERT_OK_AND_ASSIGN(result, FieldTypeUtils::ConvertToFieldType(arrow::Type::type::INT32)); + ASSERT_EQ(result, FieldType::INT); + + ASSERT_OK_AND_ASSIGN(result, FieldTypeUtils::ConvertToFieldType(arrow::Type::type::INT64)); + ASSERT_EQ(result, FieldType::BIGINT); + + ASSERT_OK_AND_ASSIGN(result, FieldTypeUtils::ConvertToFieldType(arrow::Type::type::FLOAT)); + ASSERT_EQ(result, FieldType::FLOAT); + + ASSERT_OK_AND_ASSIGN(result, FieldTypeUtils::ConvertToFieldType(arrow::Type::type::DOUBLE)); + ASSERT_EQ(result, FieldType::DOUBLE); + + ASSERT_OK_AND_ASSIGN(result, FieldTypeUtils::ConvertToFieldType(arrow::Type::type::STRING)); + ASSERT_EQ(result, FieldType::STRING); + + ASSERT_OK_AND_ASSIGN(result, FieldTypeUtils::ConvertToFieldType(arrow::Type::type::BINARY)); + ASSERT_EQ(result, FieldType::BINARY); + + ASSERT_OK_AND_ASSIGN(result, FieldTypeUtils::ConvertToFieldType(arrow::Type::type::TIMESTAMP)); + ASSERT_EQ(result, FieldType::TIMESTAMP); + + ASSERT_OK_AND_ASSIGN(result, FieldTypeUtils::ConvertToFieldType(arrow::Type::type::DECIMAL128)); + ASSERT_EQ(result, FieldType::DECIMAL); + + ASSERT_OK_AND_ASSIGN(result, FieldTypeUtils::ConvertToFieldType(arrow::Type::type::DATE32)); + ASSERT_EQ(result, FieldType::DATE); + + ASSERT_OK_AND_ASSIGN(result, FieldTypeUtils::ConvertToFieldType(arrow::Type::type::LIST)); + ASSERT_EQ(result, FieldType::ARRAY); + + ASSERT_OK_AND_ASSIGN(result, FieldTypeUtils::ConvertToFieldType(arrow::Type::type::MAP)); + ASSERT_EQ(result, FieldType::MAP); + + ASSERT_OK_AND_ASSIGN(result, FieldTypeUtils::ConvertToFieldType(arrow::Type::type::STRUCT)); + ASSERT_EQ(result, FieldType::STRUCT); + + // Test unsupported Arrow type + ASSERT_NOK(FieldTypeUtils::ConvertToFieldType( + static_cast(999))); // Invalid Arrow type +} + +// Test case: FieldTypeToString +TEST(FieldTypeUtilsTest, FieldTypeToString) { + // Check string representation of different FieldTypes + ASSERT_EQ(FieldTypeUtils::FieldTypeToString(FieldType::BOOLEAN), "BOOLEAN"); + ASSERT_EQ(FieldTypeUtils::FieldTypeToString(FieldType::TINYINT), "TINYINT"); + ASSERT_EQ(FieldTypeUtils::FieldTypeToString(FieldType::SMALLINT), "SMALLINT"); + ASSERT_EQ(FieldTypeUtils::FieldTypeToString(FieldType::INT), "INT"); + ASSERT_EQ(FieldTypeUtils::FieldTypeToString(FieldType::BIGINT), "BIGINT"); + ASSERT_EQ(FieldTypeUtils::FieldTypeToString(FieldType::FLOAT), "FLOAT"); + ASSERT_EQ(FieldTypeUtils::FieldTypeToString(FieldType::DOUBLE), "DOUBLE"); + ASSERT_EQ(FieldTypeUtils::FieldTypeToString(FieldType::STRING), "STRING"); + ASSERT_EQ(FieldTypeUtils::FieldTypeToString(FieldType::BINARY), "BINARY"); + ASSERT_EQ(FieldTypeUtils::FieldTypeToString(FieldType::BLOB), "BLOB"); + ASSERT_EQ(FieldTypeUtils::FieldTypeToString(FieldType::TIMESTAMP), "TIMESTAMP"); + ASSERT_EQ(FieldTypeUtils::FieldTypeToString(FieldType::DECIMAL), "DECIMAL"); + ASSERT_EQ(FieldTypeUtils::FieldTypeToString(FieldType::DATE), "DATE"); + ASSERT_EQ(FieldTypeUtils::FieldTypeToString(FieldType::ARRAY), "ARRAY"); + ASSERT_EQ(FieldTypeUtils::FieldTypeToString(FieldType::MAP), "MAP"); + ASSERT_EQ(FieldTypeUtils::FieldTypeToString(FieldType::STRUCT), "STRUCT"); + + // Test UNKNOWN type + auto unknown_type = static_cast(999); + ASSERT_EQ(FieldTypeUtils::FieldTypeToString(unknown_type), "UNKNOWN, type id:999"); +} + +} // namespace paimon::test diff --git a/src/paimon/common/utils/file_type.cpp b/src/paimon/common/utils/file_type.cpp new file mode 100644 index 0000000..6554e44 --- /dev/null +++ b/src/paimon/common/utils/file_type.cpp @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "paimon/common/utils/file_type.h" + +#include "paimon/common/utils/path_util.h" +#include "paimon/common/utils/string_utils.h" + +namespace paimon { + +namespace { + +constexpr char SNAPSHOT_PREFIX[] = "snapshot-"; +constexpr char SCHEMA_PREFIX[] = "schema-"; +constexpr char STATISTICS_PREFIX[] = "stat-"; +constexpr char TAG_PREFIX[] = "tag-"; +constexpr char CONSUMER_PREFIX[] = "consumer-"; +constexpr char SERVICE_PREFIX[] = "service-"; +constexpr char INDEX_PATH_SUFFIX[] = ".index"; +constexpr char INDEX_PREFIX[] = "index-"; +constexpr char CHANGELOG_PREFIX[] = "changelog-"; + +constexpr char MANIFEST[] = "manifest"; +constexpr char CHANGELOG_DIR[] = "changelog"; +constexpr char GLOBAL_INDEX_INFIX[] = "global-index-"; +constexpr char TEMP_FILE_SUFFIX[] = ".tmp"; + +std::string UnwrapTempFileName(const std::string& name) { + // format: .{originalName}.{UUID}.tmp + // suffix ".{UUID}.tmp" is 41 chars: 1(dot) + 36(UUID) + 4(.tmp) + // minimum total: 1(leading dot) + 1(name) + 41(suffix) = 43 + if (name.size() < 43 || name[0] != '.' || !StringUtils::EndsWith(name, TEMP_FILE_SUFFIX)) { + return name; + } + + size_t dot_before_uuid = name.size() - 41; + if (name[dot_before_uuid] != '.') { + return name; + } + + return name.substr(1, dot_before_uuid - 1); +} + +} // namespace + +bool FileTypeUtils::IsIndex(FileType file_type) { + return file_type == FileType::kBucketIndex || file_type == FileType::kGlobalIndex || + file_type == FileType::kFileIndex; +} + +std::string FileTypeUtils::ToString(FileType file_type) { + switch (file_type) { + case FileType::kMeta: + return "meta"; + case FileType::kData: + return "data"; + case FileType::kBucketIndex: + return "bucket_index"; + case FileType::kGlobalIndex: + return "global_index"; + case FileType::kFileIndex: + return "file_index"; + } + return "data"; +} + +FileType FileTypeUtils::Classify(const std::string& file_path) { + std::string name = PathUtil::GetName(file_path); + name = UnwrapTempFileName(name); + + if (StringUtils::StartsWith(name, SNAPSHOT_PREFIX) || + StringUtils::StartsWith(name, SCHEMA_PREFIX) || + StringUtils::StartsWith(name, STATISTICS_PREFIX) || + StringUtils::StartsWith(name, TAG_PREFIX) || + StringUtils::StartsWith(name, CONSUMER_PREFIX) || + StringUtils::StartsWith(name, SERVICE_PREFIX)) { + return FileType::kMeta; + } + + if (StringUtils::EndsWith(name, INDEX_PATH_SUFFIX)) { + if (name.find(GLOBAL_INDEX_INFIX) != std::string::npos) { + return FileType::kGlobalIndex; + } + return FileType::kFileIndex; + } + + if (name.find(MANIFEST) != std::string::npos) { + return FileType::kMeta; + } + + if (StringUtils::StartsWith(name, INDEX_PREFIX)) { + return FileType::kBucketIndex; + } + + if (name == "EARLIEST" || name == "LATEST") { + return FileType::kMeta; + } + + if (StringUtils::EndsWith(name, "_SUCCESS")) { + return FileType::kMeta; + } + + const std::string parent = PathUtil::GetParentDirPath(file_path); + if (StringUtils::StartsWith(name, CHANGELOG_PREFIX) && + PathUtil::GetName(parent) == CHANGELOG_DIR) { + return FileType::kMeta; + } + + return FileType::kData; +} + +} // namespace paimon diff --git a/src/paimon/common/utils/file_type.h b/src/paimon/common/utils/file_type.h new file mode 100644 index 0000000..7c68baf --- /dev/null +++ b/src/paimon/common/utils/file_type.h @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +#include "paimon/visibility.h" + +namespace paimon { + +enum class FileType { + kMeta, + kData, + kBucketIndex, + kGlobalIndex, + kFileIndex, +}; + +class PAIMON_EXPORT FileTypeUtils { + public: + static bool IsIndex(FileType file_type); + static FileType Classify(const std::string& file_path); + static std::string ToString(FileType file_type); +}; + +} // namespace paimon diff --git a/src/paimon/common/utils/file_type_test.cpp b/src/paimon/common/utils/file_type_test.cpp new file mode 100644 index 0000000..a358b7a --- /dev/null +++ b/src/paimon/common/utils/file_type_test.cpp @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "paimon/common/utils/file_type.h" + +#include "gtest/gtest.h" + +namespace paimon::test { + +TEST(FileTypeTest, TestIsIndex) { + ASSERT_TRUE(FileTypeUtils::IsIndex(FileType::kBucketIndex)); + ASSERT_TRUE(FileTypeUtils::IsIndex(FileType::kGlobalIndex)); + ASSERT_TRUE(FileTypeUtils::IsIndex(FileType::kFileIndex)); + ASSERT_FALSE(FileTypeUtils::IsIndex(FileType::kMeta)); + ASSERT_FALSE(FileTypeUtils::IsIndex(FileType::kData)); +} + +TEST(FileTypeTest, TestMetaPrefix) { + ASSERT_EQ(FileTypeUtils::Classify("dfs://cluster/db/snapshot/snapshot-1"), FileType::kMeta); + ASSERT_EQ(FileTypeUtils::Classify("dfs://cluster/db/schema/schema-2"), FileType::kMeta); + ASSERT_EQ(FileTypeUtils::Classify("dfs://cluster/db/statistics/stat-a1b2c3d4-0"), + FileType::kMeta); + ASSERT_EQ(FileTypeUtils::Classify("dfs://cluster/db/tag/tag-3"), FileType::kMeta); + ASSERT_EQ(FileTypeUtils::Classify("dfs://cluster/db/consumer/consumer-myGroup"), + FileType::kMeta); + ASSERT_EQ(FileTypeUtils::Classify("dfs://cluster/db/service/service-4"), FileType::kMeta); +} + +TEST(FileTypeTest, TestIndexTypes) { + ASSERT_EQ(FileTypeUtils::Classify("dfs://cluster/db/index/bitmap.index"), FileType::kFileIndex); + ASSERT_EQ(FileTypeUtils::Classify("dfs://cluster/db/index/btree-global-index-a1b2.index"), + FileType::kGlobalIndex); + ASSERT_EQ(FileTypeUtils::Classify("dfs://cluster/db/index/bitmap-global-index-1.index"), + FileType::kGlobalIndex); + ASSERT_EQ( + FileTypeUtils::Classify("dfs://cluster/db/index/lumina-vector-ann-global-index-a1b2.index"), + FileType::kGlobalIndex); + ASSERT_EQ( + FileTypeUtils::Classify("dfs://cluster/db/index/tantivy-fulltext-global-index-a1b2.index"), + FileType::kGlobalIndex); + ASSERT_EQ(FileTypeUtils::Classify("dfs://cluster/db/p=1/bucket-0/index-abcdef-1"), + FileType::kBucketIndex); +} + +TEST(FileTypeTest, TestMetaSpecialNames) { + ASSERT_EQ(FileTypeUtils::Classify("dfs://cluster/db/manifest/manifest-a1b2c3d4-0"), + FileType::kMeta); + ASSERT_EQ(FileTypeUtils::Classify("dfs://cluster/db/manifest/manifest-list-1"), + FileType::kMeta); + ASSERT_EQ(FileTypeUtils::Classify("dfs://cluster/db/manifest/index-manifest-a1b2c3d4-0"), + FileType::kMeta); + ASSERT_EQ(FileTypeUtils::Classify("dfs://cluster/db/_SUCCESS"), FileType::kMeta); + ASSERT_EQ(FileTypeUtils::Classify("dfs://cluster/db/part-1_SUCCESS"), FileType::kMeta); + ASSERT_EQ(FileTypeUtils::Classify("dfs://cluster/db/hint/EARLIEST"), FileType::kMeta); + ASSERT_EQ(FileTypeUtils::Classify("dfs://cluster/db/hint/LATEST"), FileType::kMeta); + ASSERT_EQ(FileTypeUtils::Classify("dfs://cluster/db/changelog/changelog-123"), FileType::kMeta); +} + +TEST(FileTypeTest, TestDefaultData) { + ASSERT_EQ(FileTypeUtils::Classify("dfs://cluster/db/p=1/bucket-0/data-1.orc"), FileType::kData); + ASSERT_EQ(FileTypeUtils::Classify("dfs://cluster/db/p=1/bucket-0/data-1.parquet"), + FileType::kData); + ASSERT_EQ(FileTypeUtils::Classify("dfs://cluster/db/p=1/bucket-0/changelog-a1b2c3d4-0.orc"), + FileType::kData); + ASSERT_EQ(FileTypeUtils::Classify("dfs://cluster/db/p=1/bucket-0/data-a1b2c3d4-0.blob"), + FileType::kData); + ASSERT_EQ(FileTypeUtils::Classify("dfs://cluster/db/p=1/bucket-0/data-a1b2c3d4-0.vector.lance"), + FileType::kData); + ASSERT_EQ(FileTypeUtils::Classify("dfs://cluster/db/random/unknown.bin"), FileType::kData); +} + +TEST(FileTypeTest, TestChangelogInAncestorPathNotMisclassified) { + const std::string root = "hdfs://cluster/changelog/warehouse/db.db/table"; + ASSERT_EQ(FileTypeUtils::Classify(root + "/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.orc"), + FileType::kData); + ASSERT_EQ(FileTypeUtils::Classify(root + "/dt=2024-01-01/bucket-0/changelog-a1b2c3d4-0.orc"), + FileType::kData); +} + +TEST(FileTypeTest, TestBranchPaths) { + const std::string branch_root = "hdfs://cluster/warehouse/db.db/table/branch/branch-dev"; + ASSERT_EQ(FileTypeUtils::Classify(branch_root + "/snapshot/snapshot-1"), FileType::kMeta); + ASSERT_EQ(FileTypeUtils::Classify(branch_root + "/schema/schema-0"), FileType::kMeta); + ASSERT_EQ(FileTypeUtils::Classify(branch_root + "/changelog/changelog-1"), FileType::kMeta); + ASSERT_EQ(FileTypeUtils::Classify(branch_root + "/index/index-a1b2c3d4-0"), + FileType::kBucketIndex); + ASSERT_EQ(FileTypeUtils::Classify(branch_root + "/index/btree-global-index-a1b2c3d4.index"), + FileType::kGlobalIndex); +} + +TEST(FileTypeTest, TestTempWrappedName) { + ASSERT_EQ(FileTypeUtils::Classify( + "dfs://cluster/db/snapshot/.snapshot-1.12345678-1234-1234-1234-123456789abc.tmp"), + FileType::kMeta); + + ASSERT_EQ(FileTypeUtils::Classify( + "dfs://cluster/db/schema/.schema-0.12345678-1234-1234-1234-123456789abc.tmp"), + FileType::kMeta); + + ASSERT_EQ(FileTypeUtils::Classify( + "dfs://cluster/db/tag/.tag-myTag.12345678-1234-1234-1234-123456789abc.tmp"), + FileType::kMeta); + + ASSERT_EQ(FileTypeUtils::Classify( + "dfs://cluster/db/consumer/.consumer-myGroup.12345678-1234-1234-1234-" + "123456789abc.tmp"), + FileType::kMeta); + + ASSERT_EQ(FileTypeUtils::Classify( + "dfs://cluster/db/service/.service-primary-key-lookup.12345678-1234-1234-1234-" + "123456789abc.tmp"), + FileType::kMeta); + + ASSERT_EQ(FileTypeUtils::Classify( + "dfs://cluster/db/snapshot/.EARLIEST.12345678-1234-1234-1234-123456789abc.tmp"), + FileType::kMeta); + + ASSERT_EQ(FileTypeUtils::Classify( + "dfs://cluster/db/snapshot/.LATEST.12345678-1234-1234-1234-123456789abc.tmp"), + FileType::kMeta); + + ASSERT_EQ( + FileTypeUtils::Classify( + "dfs://cluster/db/p=1/bucket-0/._SUCCESS.12345678-1234-1234-1234-123456789abc.tmp"), + FileType::kMeta); + + ASSERT_EQ( + FileTypeUtils::Classify( + "dfs://cluster/db/changelog/.changelog-1.12345678-1234-1234-1234-123456789abc.tmp"), + FileType::kMeta); + + ASSERT_EQ(FileTypeUtils::Classify( + "dfs://cluster/db/statistics/.stat-a1b2c3d4-0.12345678-1234-1234-1234-" + "123456789abc.tmp"), + FileType::kMeta); + + ASSERT_EQ( + FileTypeUtils::Classify( + "dfs://cluster/db/p=1/bucket-0/.data-1.orc.12345678-1234-1234-1234-123456789abc.tmp"), + FileType::kData); + + ASSERT_EQ(FileTypeUtils::Classify( + "dfs://cluster/db/index/.bitmap.index.12345678-1234-1234-1234-123456789abc.tmp"), + FileType::kFileIndex); + + ASSERT_EQ(FileTypeUtils::Classify( + "dfs://cluster/db/index/.bitmap-global-index-1.index.12345678-1234-1234-1234-" + "123456789abc.tmp"), + FileType::kGlobalIndex); +} + +TEST(FileTypeTest, TestInvalidTempWrapperFallsBackToOriginalName) { + // No leading dot -> should not unwrap. + ASSERT_EQ(FileTypeUtils::Classify( + "dfs://cluster/db/snapshot/snapshot-1.12345678-1234-1234-1234-123456789abc.tmp"), + FileType::kMeta); + + // No .tmp suffix -> should not unwrap. + ASSERT_EQ(FileTypeUtils::Classify( + "dfs://cluster/db/snapshot/.snapshot-1.12345678-1234-1234-1234-123456789abc"), + FileType::kData); + + // Too short -> should not unwrap. + ASSERT_EQ(FileTypeUtils::Classify("dfs://cluster/db/snapshot/.x.tmp"), FileType::kData); +} + +} // namespace paimon::test From b1d5c3498d7939351c64eb84b45279ac2285b3aa Mon Sep 17 00:00:00 2001 From: "lisizhuo.lsz" Date: Thu, 4 Jun 2026 02:53:24 +0000 Subject: [PATCH 2/2] fix comment --- .../common/utils/data_converter_utils.h | 65 ------------------- .../utils/data_converter_utils_test.cpp | 47 ++------------ 2 files changed, 6 insertions(+), 106 deletions(-) diff --git a/src/paimon/common/utils/data_converter_utils.h b/src/paimon/common/utils/data_converter_utils.h index 7328988..3ffeb98 100644 --- a/src/paimon/common/utils/data_converter_utils.h +++ b/src/paimon/common/utils/data_converter_utils.h @@ -116,26 +116,6 @@ class DataConverterUtils { return Status::OK(); }; break; - case arrow::Type::FLOAT: - converter = [](const std::string& value_str, int32_t field_idx, - BinaryRowWriter* writer) { - auto value = StringUtils::StringToValue(value_str); - RETURN_INVALID_WITH_FIELD_INFO(value, field_idx, value_str, - arrow::internal::ToString(arrow::Type::FLOAT)); - writer->WriteFloat(field_idx, value.value()); - return Status::OK(); - }; - break; - case arrow::Type::DOUBLE: - converter = [](const std::string& value_str, int32_t field_idx, - BinaryRowWriter* writer) { - auto value = StringUtils::StringToValue(value_str); - RETURN_INVALID_WITH_FIELD_INFO(value, field_idx, value_str, - arrow::internal::ToString(arrow::Type::DOUBLE)); - writer->WriteDouble(field_idx, value.value()); - return Status::OK(); - }; - break; case arrow::Type::STRING: converter = [pool](const std::string& value_str, int32_t field_idx, BinaryRowWriter* writer) { @@ -161,39 +141,6 @@ class DataConverterUtils { return converter; } - // support float and double - template - static std::string FloatValueToString(const T& value, int32_t precision) { - std::stringstream oss; - if (value >= 1e-3 && value <= 1e7) { - oss << std::fixed << std::setprecision(sizeof(T)) << value; - std::string result = oss.str(); - auto pos = result.find_last_not_of('0'); - result.erase(pos + (result[pos] == '.') + 1, std::string::npos); - return result; - } - oss << std::uppercase << std::scientific << std::setprecision(precision) << value; - std::string result = oss.str(); - auto e_pos = result.find('E'); - if (e_pos != std::string::npos) { - if (result[e_pos + 1] == '+') { - result.erase(e_pos + 1, 1 + (result[e_pos + 2] == '0')); - } else { - if (result[e_pos + 1] == '-' && result[e_pos + 2] == '0') { - result.erase(e_pos + 2, 1); - } - } - auto zero_pos = e_pos - 1; - while (zero_pos >= 1 && result[zero_pos] == '0' && result[zero_pos - 1] != '.') { - zero_pos--; - } - if (e_pos - zero_pos - 1 > 0) { - result.erase(zero_pos + 1, e_pos - zero_pos - 1); - } - } - return result; - } - static Result CreateBinaryRowFieldToStringConverter( arrow::Type::type type, bool legacy_partition_name_enabled) { BinaryRowFieldToStrConverter converter; @@ -229,18 +176,6 @@ class DataConverterUtils { return std::to_string(data); }; break; - case arrow::Type::FLOAT: - converter = [](const BinaryRow& row, int32_t field_idx) { - float data = row.GetFloat(field_idx); - return FloatValueToString(data, 6); - }; - break; - case arrow::Type::DOUBLE: - converter = [](const BinaryRow& row, int32_t field_idx) { - double data = row.GetDouble(field_idx); - return FloatValueToString(data, 15); - }; - break; case arrow::Type::STRING: converter = [](const BinaryRow& row, int32_t field_idx) { BinaryString data = row.GetString(field_idx); diff --git a/src/paimon/common/utils/data_converter_utils_test.cpp b/src/paimon/common/utils/data_converter_utils_test.cpp index d7dbae7..b007855 100644 --- a/src/paimon/common/utils/data_converter_utils_test.cpp +++ b/src/paimon/common/utils/data_converter_utils_test.cpp @@ -41,8 +41,6 @@ TEST(DataConverterUtilsTest, TestDataToBinaryRowConverterWithLegacyPartitionName {"-448489", arrow::Type::INT32}, {"279039", arrow::Type::INT64}, {"1234567", arrow::Type::INT64}, - {"0.334", arrow::Type::FLOAT}, - {"467.66472", arrow::Type::DOUBLE}, {"abcde", arrow::Type::STRING}, {"这是一个很长很长的中文", arrow::Type::STRING}, {"10440", arrow::Type::DATE32}}; @@ -80,11 +78,9 @@ TEST(DataConverterUtilsTest, TestDataToBinaryRowConverterWithLegacyPartitionName ASSERT_EQ(-448489, row.GetInt(6)); ASSERT_EQ(279039, row.GetLong(7)); ASSERT_EQ(1234567, row.GetLong(8)); - ASSERT_NEAR(0.334, row.GetFloat(9), 0.0000001); - ASSERT_NEAR(467.66472, row.GetDouble(10), 0.0000001); - ASSERT_EQ("abcde", row.GetString(11).ToString()); - ASSERT_EQ("这是一个很长很长的中文", row.GetString(12).ToString()); - ASSERT_EQ(10440, row.GetDate(13)); + ASSERT_EQ("abcde", row.GetString(9).ToString()); + ASSERT_EQ("这是一个很长很长的中文", row.GetString(10).ToString()); + ASSERT_EQ(10440, row.GetDate(11)); for (size_t idx = 0; idx < data.size(); idx++) { ASSERT_OK_AND_ASSIGN(auto partition_field_str, reconverters[idx](row, idx)); @@ -104,8 +100,6 @@ TEST(DataConverterUtilsTest, TestDataToBinaryRowConverterWithNoLegacyPartitionNa {"-448489", arrow::Type::INT32}, {"279039", arrow::Type::INT64}, {"1234567", arrow::Type::INT64}, - {"0.334", arrow::Type::FLOAT}, - {"467.66472", arrow::Type::DOUBLE}, {"abcde", arrow::Type::STRING}, {"这是一个很长很长的中文", arrow::Type::STRING}, {"1998-08-02", arrow::Type::DATE32}}; @@ -138,11 +132,9 @@ TEST(DataConverterUtilsTest, TestDataToBinaryRowConverterWithNoLegacyPartitionNa ASSERT_EQ(-448489, row.GetInt(6)); ASSERT_EQ(279039, row.GetLong(7)); ASSERT_EQ(1234567, row.GetLong(8)); - ASSERT_NEAR(0.334, row.GetFloat(9), 0.0000001); - ASSERT_NEAR(467.66472, row.GetDouble(10), 0.0000001); - ASSERT_EQ("abcde", row.GetString(11).ToString()); - ASSERT_EQ("这是一个很长很长的中文", row.GetString(12).ToString()); - ASSERT_EQ(10440, row.GetDate(13)); + ASSERT_EQ("abcde", row.GetString(9).ToString()); + ASSERT_EQ("这是一个很长很长的中文", row.GetString(10).ToString()); + ASSERT_EQ(10440, row.GetDate(11)); for (size_t idx = 0; idx < data.size(); idx++) { ASSERT_OK_AND_ASSIGN(auto partition_field_str, reconverters[idx](row, idx)); @@ -150,31 +142,4 @@ TEST(DataConverterUtilsTest, TestDataToBinaryRowConverterWithNoLegacyPartitionNa } } -TEST(DataConverterUtilsTest, TestValueToStringSimple) { - ASSERT_EQ("233.0", DataConverterUtils::FloatValueToString(static_cast(233), 6)); - ASSERT_EQ("3.0E-4", - DataConverterUtils::FloatValueToString(static_cast(0.0003), 6)); - ASSERT_EQ("3.478589E10", - DataConverterUtils::FloatValueToString(static_cast(34785895352), 6)); - ASSERT_EQ("1.0E9", - DataConverterUtils::FloatValueToString(static_cast(1000000000), 6)); - ASSERT_EQ("1000000.0", - DataConverterUtils::FloatValueToString(static_cast(1000000), 6)); - ASSERT_EQ("467.6647", - DataConverterUtils::FloatValueToString(static_cast(467.6647), 6)); - - ASSERT_EQ("233.0", - DataConverterUtils::FloatValueToString(static_cast(233), 15)); - ASSERT_EQ("3.4785895352E10", - DataConverterUtils::FloatValueToString(static_cast(34785895352), 15)); - ASSERT_EQ("1.0E9", - DataConverterUtils::FloatValueToString(static_cast(1000000000), 15)); - ASSERT_EQ("1000000.0", - DataConverterUtils::FloatValueToString(static_cast(1000000), 15)); - ASSERT_EQ("467.66472", - DataConverterUtils::FloatValueToString(static_cast(467.66472), 6)); - ASSERT_EQ("123456.123456", DataConverterUtils::FloatValueToString( - static_cast(123456.123456), 6)); -} - } // namespace paimon::test