Skip to content

Commit db86467

Browse files
Enmkarthurpassos
authored andcommitted
Merge pull request #1039 from Altinity/fp_antaya_25_8_parquet_metadata_caching
Antalya 25.8 - Forward port of #938 - Parquet metadata caching
1 parent 65af401 commit db86467

23 files changed

+334
-16
lines changed

programs/server/Server.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,10 @@
163163
# include <azure/core/diagnostics/logger.hpp>
164164
#endif
165165

166+
#if USE_PARQUET
167+
# include <Processors/Formats/Impl/ParquetFileMetaDataCache.h>
168+
#endif
169+
166170

167171
/// A minimal file used when the server is run without installation
168172
constexpr unsigned char resource_embedded_xml[] =
@@ -369,6 +373,7 @@ namespace ServerSetting
369373
extern const ServerSettingsBool abort_on_logical_error;
370374
extern const ServerSettingsUInt64 jemalloc_flush_profile_interval_bytes;
371375
extern const ServerSettingsBool jemalloc_flush_profile_on_memory_exceeded;
376+
<<<<<<< HEAD
372377
extern const ServerSettingsString allowed_disks_for_table_engines;
373378
extern const ServerSettingsUInt64 s3_credentials_provider_max_cache_size;
374379
extern const ServerSettingsUInt64 max_open_files;
@@ -415,6 +420,9 @@ namespace ServerSetting
415420
extern const ServerSettingsUInt64 keeper_server_socket_send_timeout_sec;
416421
extern const ServerSettingsString hdfs_libhdfs3_conf;
417422
extern const ServerSettingsString config_file;
423+
=======
424+
extern const ServerSettingsUInt64 input_format_parquet_metadata_cache_max_size;
425+
>>>>>>> 67a38d1181c (Merge pull request #1039 from Altinity/fp_antaya_25_8_parquet_metadata_caching)
418426
}
419427

420428
namespace ErrorCodes
@@ -2739,6 +2747,10 @@ try
27392747

27402748
auto replicas_reconnector = ReplicasReconnector::init(global_context);
27412749

2750+
#if USE_PARQUET
2751+
ParquetFileMetaDataCache::instance()->setMaxSizeInBytes(server_settings[ServerSetting::input_format_parquet_metadata_cache_max_size]);
2752+
#endif
2753+
27422754
/// Set current database name before loading tables and databases because
27432755
/// system logs may copy global context.
27442756
std::string default_database = server_settings[ServerSetting::default_database];

src/Access/Common/AccessType.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -334,10 +334,10 @@ enum class AccessType : uint8_t
334334
M(SYSTEM_DROP_SCHEMA_CACHE, "SYSTEM CLEAR SCHEMA CACHE, SYSTEM DROP SCHEMA CACHE, DROP SCHEMA CACHE", GLOBAL, SYSTEM_DROP_CACHE) \
335335
M(SYSTEM_DROP_FORMAT_SCHEMA_CACHE, "SYSTEM CLEAR FORMAT SCHEMA CACHE, SYSTEM DROP FORMAT SCHEMA CACHE, DROP FORMAT SCHEMA CACHE", GLOBAL, SYSTEM_DROP_CACHE) \
336336
M(SYSTEM_DROP_S3_CLIENT_CACHE, "SYSTEM CLEAR S3 CLIENT CACHE, SYSTEM DROP S3 CLIENT, DROP S3 CLIENT CACHE", GLOBAL, SYSTEM_DROP_CACHE) \
337+
M(SYSTEM_DROP_PARQUET_METADATA_CACHE, "SYSTEM DROP PARQUET METADATA CACHE", GLOBAL, SYSTEM_DROP_CACHE) \
337338
M(SYSTEM_DROP_CACHE, "DROP CACHE", GROUP, SYSTEM) \
338339
M(SYSTEM_RELOAD_CONFIG, "RELOAD CONFIG", GLOBAL, SYSTEM_RELOAD) \
339340
M(SYSTEM_RELOAD_USERS, "RELOAD USERS", GLOBAL, SYSTEM_RELOAD) \
340-
M(SYSTEM_RELOAD_DICTIONARY, "SYSTEM RELOAD DICTIONARIES, RELOAD DICTIONARY, RELOAD DICTIONARIES", GLOBAL, SYSTEM_RELOAD) \
341341
M(SYSTEM_RELOAD_MODEL, "SYSTEM RELOAD MODELS, RELOAD MODEL, RELOAD MODELS", GLOBAL, SYSTEM_RELOAD) \
342342
M(SYSTEM_RELOAD_FUNCTION, "SYSTEM RELOAD FUNCTIONS, RELOAD FUNCTION, RELOAD FUNCTIONS", GLOBAL, SYSTEM_RELOAD) \
343343
M(SYSTEM_RELOAD_EMBEDDED_DICTIONARIES, "RELOAD EMBEDDED DICTIONARIES", GLOBAL, SYSTEM_RELOAD) /* implicitly enabled by the grant SYSTEM_RELOAD_DICTIONARY ON *.* */\

src/Common/ProfileEvents.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1298,7 +1298,8 @@ The server successfully detected this situation and will download merged part fr
12981298
M(RuntimeFilterRowsChecked, "Number of rows checked by JOIN Runtime Filters", ValueType::Number) \
12991299
M(RuntimeFilterRowsPassed, "Number of rows that passed (not filtered out by) JOIN Runtime Filters", ValueType::Number) \
13001300
M(RuntimeFilterRowsSkipped, "Number of rows in blocks that were skipped by JOIN Runtime Filters", ValueType::Number) \
1301-
1301+
M(ParquetMetaDataCacheHits, "Number of times the read from filesystem cache hit the cache.", ValueType::Number) \
1302+
M(ParquetMetaDataCacheMisses, "Number of times the read from filesystem cache miss the cache.", ValueType::Number) \
13021303

13031304
#ifdef APPLY_FOR_EXTERNAL_EVENTS
13041305
#define APPLY_FOR_EVENTS(M) APPLY_FOR_BUILTIN_EVENTS(M) APPLY_FOR_EXTERNAL_EVENTS(M)

src/Core/FormatFactorySettings.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1527,8 +1527,7 @@ Allow to write information about geo columns in parquet metadata and encode colu
15271527
DECLARE(Bool, into_outfile_create_parent_directories, false, R"(
15281528
Automatically create parent directories when using INTO OUTFILE if they do not already exists.
15291529
)", 0) \
1530-
1531-
1530+
DECLARE(Bool, input_format_parquet_use_metadata_cache, true, R"(Enable parquet file metadata caching)", 0) \
15321531
// End of FORMAT_FACTORY_SETTINGS
15331532

15341533
#define OBSOLETE_FORMAT_SETTINGS(M, ALIAS) \

src/Core/ServerSettings.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1542,7 +1542,8 @@ The policy on how to perform a scheduling of CPU slots specified by `concurrent_
15421542
DECLARE(UInt64, keeper_server_socket_receive_timeout_sec, DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC, R"(Keeper socket receive timeout.)", 0, "keeper_server.socket_receive_timeout_sec") \
15431543
DECLARE(UInt64, keeper_server_socket_send_timeout_sec, DBMS_DEFAULT_SEND_TIMEOUT_SEC, R"(Keeper socket send timeout.)", 0, "keeper_server.socket_send_timeout_sec") \
15441544
DECLARE(String, hdfs_libhdfs3_conf, "", R"(Points libhdfs3 to the right location for its config.)", 0, "hdfs.libhdfs3_conf") \
1545-
DECLARE(String, config_file, "config.xml", R"(Points to the server config file.)", 0, "config-file")
1545+
DECLARE(String, config_file, "config.xml", R"(Points to the server config file.)", 0, "config-file") \
1546+
DECLARE(UInt64, input_format_parquet_metadata_cache_max_size, 500000000, "Maximum size of parquet file metadata cache", 0)
15461547

15471548
// clang-format on
15481549

src/Core/SettingsChangesHistory.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -434,6 +434,11 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory()
434434
{"parallel_hash_join_threshold", 0, 0, "New setting"},
435435
/// Release closed. Please use 25.4
436436
});
437+
addSettingsChanges(settings_changes_history, "24.12.2.20000",
438+
{
439+
// Altinity Antalya modifications atop of 24.12
440+
{"input_format_parquet_use_metadata_cache", true, true, "New setting, turned ON by default"}, // https://github.com/Altinity/ClickHouse/pull/586
441+
});
437442
addSettingsChanges(settings_changes_history, "25.2",
438443
{
439444
/// Release closed. Please use 25.3

src/Interpreters/InterpreterSystemQuery.cpp

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,10 @@
7777
#include <Formats/ProtobufSchemas.h>
7878
#endif
7979

80+
#if USE_PARQUET
81+
#include <Processors/Formats/Impl/ParquetFileMetaDataCache.h>
82+
#endif
83+
8084
#if USE_AWS_S3
8185
#include <IO/S3/Client.h>
8286
#endif
@@ -453,6 +457,16 @@ BlockIO InterpreterSystemQuery::execute()
453457
getContext()->clearQueryResultCache(query.query_result_cache_tag);
454458
break;
455459
}
460+
case Type::DROP_PARQUET_METADATA_CACHE:
461+
{
462+
#if USE_PARQUET
463+
getContext()->checkAccess(AccessType::SYSTEM_DROP_PARQUET_METADATA_CACHE);
464+
ParquetFileMetaDataCache::instance()->clear();
465+
break;
466+
#else
467+
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "The server was compiled without the support for Parquet");
468+
#endif
469+
}
456470
case Type::CLEAR_COMPILED_EXPRESSION_CACHE:
457471
#if USE_EMBEDDED_COMPILER
458472
getContext()->checkAccess(AccessType::SYSTEM_DROP_COMPILED_EXPRESSION_CACHE);
@@ -1994,10 +2008,12 @@ AccessRightsElements InterpreterSystemQuery::getRequiredAccessForDDLOnCluster()
19942008
case Type::CLEAR_FILESYSTEM_CACHE:
19952009
case Type::CLEAR_DISTRIBUTED_CACHE:
19962010
case Type::SYNC_FILESYSTEM_CACHE:
1997-
case Type::CLEAR_PAGE_CACHE:
1998-
case Type::CLEAR_SCHEMA_CACHE:
1999-
case Type::CLEAR_FORMAT_SCHEMA_CACHE:
2000-
case Type::CLEAR_S3_CLIENT_CACHE:
2011+
case Type::DROP_PAGE_CACHE:
2012+
case Type::DROP_SCHEMA_CACHE:
2013+
case Type::DROP_FORMAT_SCHEMA_CACHE:
2014+
case Type::DROP_PARQUET_METADATA_CACHE:
2015+
case Type::DROP_S3_CLIENT_CACHE:
2016+
case Type::DROP_PARQUET_METADATA_CACHE:
20012017
{
20022018
required_access.emplace_back(AccessType::SYSTEM_DROP_CACHE);
20032019
break;

src/Parsers/ASTSystemQuery.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -575,6 +575,7 @@ void ASTSystemQuery::formatImpl(WriteBuffer & ostr, const FormatSettings & setti
575575
case Type::CLEAR_TEXT_INDEX_CACHES:
576576
case Type::CLEAR_COMPILED_EXPRESSION_CACHE:
577577
case Type::CLEAR_S3_CLIENT_CACHE:
578+
case Type::DROP_PARQUET_METADATA_CACHE:
578579
case Type::CLEAR_ICEBERG_METADATA_CACHE:
579580
case Type::RESET_COVERAGE:
580581
case Type::RESTART_REPLICAS:

src/Parsers/ASTSystemQuery.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ class ASTSystemQuery : public IAST, public ASTQueryWithOnCluster
4343
CLEAR_QUERY_CONDITION_CACHE,
4444
CLEAR_QUERY_CACHE,
4545
CLEAR_COMPILED_EXPRESSION_CACHE,
46+
DROP_PARQUET_METADATA_CACHE,
4647
CLEAR_ICEBERG_METADATA_CACHE,
4748
CLEAR_FILESYSTEM_CACHE,
4849
CLEAR_DISTRIBUTED_CACHE,

src/Processors/Formats/IInputFormat.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include <base/types.h>
99
#include <Core/BlockMissingValues.h>
1010
#include <Processors/ISource.h>
11+
#include <Core/Settings.h>
1112

1213

1314
namespace DB
@@ -128,6 +129,9 @@ class IInputFormat : public ISource
128129

129130
void needOnlyCount() { need_only_count = true; }
130131

132+
/// Set additional info/key/id related to underlying storage of the ReadBuffer
133+
virtual void setStorageRelatedUniqueKey(const Settings & /*settings*/, const String & /*key*/) {}
134+
131135
protected:
132136
ReadBuffer & getReadBuffer() const { chassert(in); return *in; }
133137

0 commit comments

Comments
 (0)