Skip to content

Commit 3b5b2a6

Browse files
cxzl25 and dongjoon-hyun
authored and committed
ORC-1667: Add check tool to check the index of the specified column
### What changes were proposed in this pull request? This PR aims to check the index of the specified column. We can test the filtering effect by specifying different types. `check --type stat` - Only use column statistics. `check --type bloom-filter` - Only use bloom filter. `check --type predicate` - Used in combination with column statistics and bloom filter. ### Why are the changes needed? ORC supports specifying multiple columns to generate bloom filter indexes, but it lacks a convenient tool to verify the effect of bloom filter. Parquet also has similar commands. [PARQUET-2138](https://issues.apache.org/jira/browse/PARQUET-2138): Add ShowBloomFilterCommand to parquet-cli ### How was this patch tested? Add UT ### Was this patch authored or co-authored using generative AI tooling? No Closes #1862 from cxzl25/ORC-1667. Authored-by: sychen <[email protected]> Signed-off-by: Dongjoon Hyun <[email protected]>
1 parent 1830e86 commit 3b5b2a6

File tree

4 files changed

+571
-0
lines changed

4 files changed

+571
-0
lines changed
Lines changed: 336 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,336 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
* <p/>
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
* <p/>
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.orc.tools;
20+
21+
import org.apache.commons.cli.CommandLine;
22+
import org.apache.commons.cli.DefaultParser;
23+
import org.apache.commons.cli.HelpFormatter;
24+
import org.apache.commons.cli.Option;
25+
import org.apache.commons.cli.Options;
26+
import org.apache.hadoop.conf.Configuration;
27+
import org.apache.hadoop.fs.FileSystem;
28+
import org.apache.hadoop.fs.LocatedFileStatus;
29+
import org.apache.hadoop.fs.Path;
30+
import org.apache.hadoop.fs.RemoteIterator;
31+
import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
32+
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
33+
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentImpl;
34+
import org.apache.orc.ColumnStatistics;
35+
import org.apache.orc.OrcFile;
36+
import org.apache.orc.OrcProto;
37+
import org.apache.orc.OrcUtils;
38+
import org.apache.orc.Reader;
39+
import org.apache.orc.StripeInformation;
40+
import org.apache.orc.TypeDescription;
41+
import org.apache.orc.impl.ColumnStatisticsImpl;
42+
import org.apache.orc.impl.OrcIndex;
43+
import org.apache.orc.impl.RecordReaderImpl;
44+
import org.apache.orc.util.BloomFilter;
45+
import org.apache.orc.util.BloomFilterIO;
46+
47+
import java.util.ArrayList;
48+
import java.util.List;
49+
50+
/**
51+
* Check whether the specified column of multiple ORC files can filter the specified value.
52+
*/
53+
public class CheckTool {
54+
55+
private static final String CHECK_TYPE_PREDICATE = "predicate";
56+
private static final String CHECK_TYPE_STAT = "stat";
57+
private static final String CHECK_TYPE_BLOOM_FILTER = "bloom-filter";
58+
59+
public static void main(Configuration conf, String[] args) throws Exception {
60+
Options opts = createOptions();
61+
CommandLine cli = new DefaultParser().parse(opts, args);
62+
HelpFormatter formatter = new HelpFormatter();
63+
if (cli.hasOption('h')) {
64+
formatter.printHelp("check", opts);
65+
return;
66+
}
67+
68+
String type = cli.getOptionValue("type");
69+
if (type == null ||
70+
(!type.equals(CHECK_TYPE_PREDICATE) &&
71+
!type.equals(CHECK_TYPE_STAT) &&
72+
!type.equals(CHECK_TYPE_BLOOM_FILTER))) {
73+
System.err.printf("type %s not support %n", type);
74+
formatter.printHelp("check", opts);
75+
return;
76+
}
77+
String column = cli.getOptionValue("column");
78+
if (column == null || column.isEmpty()) {
79+
System.err.println("column is null");
80+
formatter.printHelp("check", opts);
81+
return;
82+
}
83+
String[] values = cli.getOptionValues("values");
84+
if (values == null || values.length == 0) {
85+
System.err.println("values is null");
86+
formatter.printHelp("check", opts);
87+
return;
88+
}
89+
boolean ignoreExtension = cli.hasOption("ignoreExtension");
90+
91+
List<Path> inputFiles = new ArrayList<>();
92+
String[] files = cli.getArgs();
93+
for (String root : files) {
94+
Path rootPath = new Path(root);
95+
FileSystem fs = rootPath.getFileSystem(conf);
96+
for (RemoteIterator<LocatedFileStatus> itr = fs.listFiles(rootPath, true); itr.hasNext(); ) {
97+
LocatedFileStatus status = itr.next();
98+
if (status.isFile() && (ignoreExtension || status.getPath().getName().endsWith(".orc"))) {
99+
inputFiles.add(status.getPath());
100+
}
101+
}
102+
}
103+
if (inputFiles.isEmpty()) {
104+
System.err.println("No files found.");
105+
System.exit(1);
106+
}
107+
108+
for (Path inputFile : inputFiles) {
109+
System.out.println("input file: " + inputFile);
110+
FileSystem fs = inputFile.getFileSystem(conf);
111+
try (Reader reader = OrcFile.createReader(inputFile,
112+
OrcFile.readerOptions(conf).filesystem(fs))) {
113+
RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
114+
TypeDescription schema = reader.getSchema();
115+
boolean[] includedColumns = OrcUtils.includeColumns(column, schema);
116+
int colIndex = -1;
117+
for (int i = 0; i < includedColumns.length; i++) {
118+
if (includedColumns[i]) {
119+
colIndex = i;
120+
break;
121+
}
122+
}
123+
if (colIndex == -1) {
124+
System.err.printf("column: %s not found in file: %s%n", column, inputFile);
125+
continue;
126+
}
127+
int stripeIndex = -1;
128+
for (StripeInformation stripe : reader.getStripes()) {
129+
++stripeIndex;
130+
131+
OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
132+
133+
OrcProto.ColumnEncoding columnEncoding = footer.getColumns(colIndex);
134+
TypeDescription subtype = reader.getSchema().findSubtype(colIndex);
135+
TypeDescription.Category columnCategory = subtype.getCategory();
136+
OrcIndex indices = rows.readRowIndex(stripeIndex, null, includedColumns);
137+
if (type.equals(CHECK_TYPE_BLOOM_FILTER)) {
138+
checkBloomFilter(inputFile, reader, indices, stripeIndex,
139+
colIndex, column, columnEncoding, columnCategory, values);
140+
} else {
141+
checkStatOrPredicate(inputFile, reader, indices, stripeIndex,
142+
colIndex, column, columnEncoding, subtype, columnCategory, values, type);
143+
}
144+
}
145+
}
146+
}
147+
}
148+
149+
private static void checkStatOrPredicate(Path inputFile,
150+
Reader reader,
151+
OrcIndex indices,
152+
int stripeIndex,
153+
int colIndex,
154+
String column,
155+
OrcProto.ColumnEncoding columnEncoding,
156+
TypeDescription subtype,
157+
TypeDescription.Category columnCategory,
158+
String[] values,
159+
String type) {
160+
OrcProto.RowIndex rowGroupIndex = indices.getRowGroupIndex()[colIndex];
161+
int entryCount = rowGroupIndex.getEntryCount();
162+
boolean hasBloomFilter = true;
163+
OrcProto.BloomFilterIndex[] bloomFilterIndices = indices.getBloomFilterIndex();
164+
OrcProto.BloomFilterIndex bloomFilterIndex = bloomFilterIndices[colIndex];
165+
if (bloomFilterIndex == null || bloomFilterIndex.getBloomFilterList().isEmpty()) {
166+
hasBloomFilter = false;
167+
}
168+
for (int i = 0; i < entryCount; i++) {
169+
OrcProto.ColumnStatistics statistics = rowGroupIndex.getEntry(i).getStatistics();
170+
ColumnStatistics cs = ColumnStatisticsImpl.deserialize(subtype,
171+
statistics,
172+
reader.writerUsedProlepticGregorian(),
173+
reader.getConvertToProlepticGregorian());
174+
175+
BloomFilter bloomFilter = null;
176+
if (type.equals(CHECK_TYPE_PREDICATE) && hasBloomFilter) {
177+
bloomFilter = BloomFilterIO.deserialize(
178+
indices.getBloomFilterKinds()[colIndex], columnEncoding,
179+
reader.getWriterVersion(), columnCategory, bloomFilterIndex.getBloomFilter(i));
180+
}
181+
182+
for (String value : values) {
183+
PredicateLeaf predicateLeaf = createPredicateLeaf(PredicateLeaf.Operator.EQUALS,
184+
getPredicateLeafType(columnCategory), column, convert(columnCategory, value));
185+
SearchArgument.TruthValue truthValue = RecordReaderImpl.evaluatePredicate(
186+
cs, predicateLeaf, bloomFilter);
187+
System.out.printf("stripe: %d, rowIndex: %d, value: %s, test value: %s%n",
188+
stripeIndex, i, value, truthValue);
189+
}
190+
}
191+
}
192+
193+
private static void checkBloomFilter(Path inputFile,
194+
Reader reader,
195+
OrcIndex indices,
196+
int stripeIndex,
197+
int colIndex,
198+
String column,
199+
OrcProto.ColumnEncoding columnEncoding,
200+
TypeDescription.Category columnCategory,
201+
String[] values) {
202+
OrcProto.BloomFilterIndex[] bloomFilterIndices = indices.getBloomFilterIndex();
203+
OrcProto.BloomFilterIndex bloomFilterIndex = bloomFilterIndices[colIndex];
204+
if (bloomFilterIndex == null || bloomFilterIndex.getBloomFilterList().isEmpty()) {
205+
System.err.printf("The bloom filter index for column: %s is not found in file: %s%n",
206+
column, inputFile);
207+
return;
208+
}
209+
List<OrcProto.BloomFilter> bloomFilterList = bloomFilterIndex.getBloomFilterList();
210+
for (int i = 0; i < bloomFilterList.size(); i++) {
211+
OrcProto.BloomFilter bf = bloomFilterList.get(i);
212+
org.apache.orc.util.BloomFilter bloomFilter = BloomFilterIO.deserialize(
213+
indices.getBloomFilterKinds()[colIndex], columnEncoding,
214+
reader.getWriterVersion(), columnCategory, bf);
215+
for (String value : values) {
216+
boolean testResult = test(bloomFilter, columnCategory, value);
217+
if (testResult) {
218+
System.out.printf("stripe: %d, rowIndex: %d, value: %s, bloom filter: maybe exist%n",
219+
stripeIndex, i, value);
220+
} else {
221+
System.out.printf("stripe: %d, rowIndex: %d, value: %s, bloom filter: not exist%n",
222+
stripeIndex, i, value);
223+
}
224+
}
225+
}
226+
}
227+
228+
private static boolean test(BloomFilter bloomFilter,
229+
TypeDescription.Category columnCategory, String value) {
230+
switch (columnCategory){
231+
case BYTE:
232+
case SHORT:
233+
case INT:
234+
case LONG:
235+
case DATE:
236+
case TIMESTAMP:
237+
return bloomFilter.testLong(Long.parseLong(value));
238+
case FLOAT:
239+
case DOUBLE:
240+
return bloomFilter.testDouble(Double.parseDouble(value));
241+
case STRING:
242+
case CHAR:
243+
case VARCHAR:
244+
case DECIMAL:
245+
return bloomFilter.testString(value);
246+
default:
247+
throw new IllegalStateException("Not supported type:" + columnCategory);
248+
}
249+
}
250+
251+
private static Object convert(
252+
TypeDescription.Category columnCategory, String value) {
253+
switch (columnCategory) {
254+
case BYTE:
255+
case SHORT:
256+
case INT:
257+
case LONG:
258+
case DATE:
259+
case TIMESTAMP:
260+
return Long.parseLong(value);
261+
case FLOAT:
262+
case DOUBLE:
263+
return Double.parseDouble(value);
264+
case STRING:
265+
case CHAR:
266+
case VARCHAR:
267+
case DECIMAL:
268+
return value;
269+
default:
270+
throw new IllegalStateException("Not supported type:" + columnCategory);
271+
}
272+
}
273+
274+
private static PredicateLeaf.Type getPredicateLeafType(TypeDescription.Category columnCategory) {
275+
switch (columnCategory){
276+
case BOOLEAN:
277+
return PredicateLeaf.Type.BOOLEAN;
278+
case BYTE:
279+
case SHORT:
280+
case INT:
281+
case LONG:
282+
return PredicateLeaf.Type.LONG;
283+
case DATE:
284+
return PredicateLeaf.Type.DATE;
285+
case TIMESTAMP:
286+
return PredicateLeaf.Type.TIMESTAMP;
287+
case FLOAT:
288+
case DOUBLE:
289+
return PredicateLeaf.Type.FLOAT;
290+
case STRING:
291+
case CHAR:
292+
case VARCHAR:
293+
case DECIMAL:
294+
return PredicateLeaf.Type.STRING;
295+
default:
296+
throw new IllegalStateException("Not supported type:" + columnCategory);
297+
}
298+
}
299+
300+
private static PredicateLeaf createPredicateLeaf(PredicateLeaf.Operator operator,
301+
PredicateLeaf.Type type,
302+
String columnName,
303+
Object literal) {
304+
return new SearchArgumentImpl.PredicateLeafImpl(operator, type, columnName,
305+
literal, null);
306+
}
307+
308+
private static Options createOptions() {
309+
Options result = new Options();
310+
311+
result.addOption(Option.builder("t")
312+
.longOpt("type")
313+
.desc(String.format("check type = {%s, %s, %s}",
314+
CHECK_TYPE_PREDICATE, CHECK_TYPE_STAT, CHECK_TYPE_BLOOM_FILTER))
315+
.hasArg()
316+
.build());
317+
318+
result.addOption(Option.builder("col")
319+
.longOpt("column")
320+
.desc("column name")
321+
.hasArg()
322+
.build());
323+
324+
result.addOption(Option.builder("v")
325+
.longOpt("values")
326+
.desc("test values")
327+
.hasArgs()
328+
.build());
329+
330+
result.addOption(Option.builder("h")
331+
.longOpt("help")
332+
.desc("print help message")
333+
.build());
334+
return result;
335+
}
336+
}

java/tools/src/java/org/apache/orc/tools/Driver.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ public static void main(String[] args) throws Exception {
8686
" [--define X=Y] <command> <args>");
8787
System.err.println();
8888
System.err.println("Commands:");
89+
System.err.println(" check - check the index of the specified column");
8990
System.err.println(" convert - convert CSV/JSON/ORC files to ORC");
9091
System.err.println(" count - recursively find *.orc and print the number of rows");
9192
System.err.println(" data - print the data from the ORC file");
@@ -106,6 +107,9 @@ public static void main(String[] args) throws Exception {
106107
conf.set(pair.getKey().toString(), pair.getValue().toString());
107108
}
108109
switch (options.command) {
110+
case "check":
111+
CheckTool.main(conf, options.commandArgs);
112+
break;
109113
case "convert":
110114
ConvertTool.main(conf, options.commandArgs);
111115
break;

0 commit comments

Comments
 (0)