Skip to content

Commit 584ea0e

Browse files
author
Your Name
committed
Changes to set max sample size
Ignore address frequency. Be more permissive matching strings. Vendored dependencies. Support zero-based images.
1 parent 8b45728 commit 584ea0e

File tree

2,182 files changed

+1006735
-31
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

2,182 files changed

+1006735
-31
lines changed

.cargo/config.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
[source.crates-io]
2+
replace-with = "vendored-sources"
3+
4+
[source.vendored-sources]
5+
directory = "vendor"

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "rbase"
3-
version = "0.1.6"
3+
version = "0.1.7"
44
edition = "2021"
55
repository = "https://github.com/WorksButNotTested/rbase.git"
66
authors = [ "WorksButNotTested" ]

src/main.rs

Lines changed: 59 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,22 @@ pub struct Args {
9292
default_value = "8"
9393
)]
9494
pub jobs: usize,
95+
96+
#[arg(
97+
short = 's',
98+
long = "max-strings",
99+
help = "Maximum number of strings to sample",
100+
default_value = "100000"
101+
)]
102+
pub max_strings: usize,
103+
104+
#[arg(
105+
short = 'a',
106+
long = "max-addresses",
107+
help = "Maximum number of addresses to sample",
108+
default_value = "1000000"
109+
)]
110+
pub max_addresses: usize,
95111
}
96112

97113
impl Args {
@@ -169,7 +185,7 @@ impl<
169185
}
170186

171187
fn get_string_offsets(min: usize, max: usize, chunks: Vec<(usize, &[u8])>) -> DashSet<T> {
172-
let regex = format!("([a-zA-Z0-9_]{{{},{}}})\0", min, max);
188+
let regex = format!("([[:print:][:space:]]{{{},{}}})\0", min, max);
173189
let re = Regex::new(&regex).unwrap();
174190
let pb = Self::get_progress_bar("Finding strings", chunks.len());
175191
let set = DashSet::<T>::new();
@@ -182,14 +198,13 @@ impl<
182198
set.insert(k);
183199
});
184200
});
185-
println!("Found: {:?} strings", set.len());
186201
set
187202
}
188203

189-
fn index_strings_by_page_offset(addresses: DashSet<T>) -> DashMap<T, Vec<T>> {
190-
let pb = Self::get_progress_bar("Indexing strings", addresses.len());
204+
fn index_strings_by_page_offset(strings: DashSet<T>) -> DashMap<T, Vec<T>> {
205+
let pb = Self::get_progress_bar("Indexing strings", strings.len());
191206
let map = DashMap::<T, Vec<T>>::new();
192-
addresses.into_par_iter().progress_with(pb).for_each(|k| {
207+
strings.into_par_iter().progress_with(pb).for_each(|k| {
193208
let offset = k & T::try_from(PAGE_OFFSET_MASK).unwrap();
194209
if let Some(mut v) = map.get_mut(&offset) {
195210
v.push(k);
@@ -203,37 +218,40 @@ impl<
203218
pub fn get_strings_by_page_offset(min: usize, max: usize, bytes: &[u8]) -> DashMap<T, Vec<T>> {
204219
let chunks = Self::get_overlapping_chunks(bytes, max - 1);
205220
let addresses = Self::get_string_offsets(min, max, chunks);
206-
println!("Found: {:?} unique strings", addresses.len());
221+
println!("Found: {:?} strings", addresses.len());
207222
let index = Self::index_strings_by_page_offset(addresses);
208223
index
209224
}
210225

211226
/* Addresses */
212-
fn get_address_frequencies<F: Fn(&[u8]) -> T + Sync + Send>(
227+
fn get_addresses<F: Fn(&[u8]) -> T + Sync + Send>(
213228
bytes: &[u8],
214229
convert: F,
215-
) -> DashMap<T, usize> {
230+
) -> DashSet<T> {
216231
let chunks = bytes.chunks(size_of::<T>()).collect::<Vec<&[u8]>>();
217232
let pb = Self::get_progress_bar("Reading addresses", chunks.len());
218-
let map = DashMap::<T, usize>::new();
233+
let set = DashSet::<T>::new();
219234
chunks
220235
.into_par_iter()
221236
.progress_with(pb)
222237
.map(|p| convert(p))
223238
.filter(|&p| p != T::default())
224239
.for_each(|ptr| {
225-
*map.entry(ptr).or_insert(0) += 1;
240+
set.insert(ptr);
226241
});
227-
map
242+
set
228243
}
229244

230-
fn index_unique_addresses_by_page_offset(frequencies: DashMap<T, usize>) -> DashMap<T, Vec<T>> {
245+
fn index_addresses_by_page_offset(
246+
addresses: DashSet<T>,
247+
max_addresses: usize,
248+
) -> DashMap<T, Vec<T>> {
231249
let map = DashMap::<T, Vec<T>>::new();
232-
let pb = Self::get_progress_bar("Finding unique addresses", frequencies.len());
233-
frequencies
250+
let pb = Self::get_progress_bar("Indexing addresses", addresses.len());
251+
addresses
234252
.into_par_iter()
253+
.take_any(max_addresses)
235254
.progress_with(pb)
236-
.filter_map(|(k, v)| if v > 1 { Some(k) } else { None })
237255
.for_each(|k| {
238256
let offset = k & T::try_from(PAGE_OFFSET_MASK).unwrap();
239257
if let Some(mut v) = map.get_mut(&offset) {
@@ -248,13 +266,13 @@ impl<
248266
pub fn get_addresses_by_page_offset<F: Fn(&[u8]) -> T + Sync + Send + Copy>(
249267
bytes: &[u8],
250268
convert: F,
269+
max_addresses: usize,
251270
) -> DashMap<T, Vec<T>> {
252-
let frequencies = Self::get_address_frequencies(bytes, convert);
253-
println!("Found: {:?} addresses", frequencies.len());
271+
let addresses = Self::get_addresses(bytes, convert);
272+
println!("Found: {:?} addresses", addresses.len());
254273

255-
let addresses = Self::index_unique_addresses_by_page_offset(frequencies);
256-
println!("Found: {:?} unique addresses", addresses.len());
257-
addresses
274+
let index = Self::index_addresses_by_page_offset(addresses, max_addresses);
275+
index
258276
}
259277

260278
/* Addresses */
@@ -268,7 +286,7 @@ impl<
268286
let (offset, strings) = r.pair();
269287
if let Some(addresses) = addresses.get(offset) {
270288
for &s in strings.iter() {
271-
for &a in addresses.iter().filter(|&&a| a > s) {
289+
for &a in addresses.iter().filter(|&&a| a >= s) {
272290
*map.entry(a - s).or_insert(0) += 1;
273291
}
274292
}
@@ -290,7 +308,7 @@ impl<
290308
pub fn get_most_frequent_candidate_base_address(
291309
strings: &DashMap<T, Vec<T>>,
292310
addresses: &DashMap<T, Vec<T>>,
293-
) -> T {
311+
) -> Option<T> {
294312
let base_addresses = Self::get_candidate_base_addresses(strings, addresses);
295313
let num_candidates = base_addresses.len();
296314
println!("Found: {:?} candidates", num_candidates);
@@ -301,11 +319,11 @@ impl<
301319
let sorted = Self::sort_candidate_base_addresses_by_frequency(filtered);
302320
for (idx, (base, frequency)) in sorted.iter().take(10).enumerate() {
303321
let pct = 100.0 * (*frequency as f64) / (num_candidates as f64);
304-
println!("{:2}: {base:x}: {frequency} ({pct:.2}%)", idx + 1);
322+
println!("{:2}: {base:0x}: {frequency} ({pct:.2}%)", idx + 1);
305323
}
306324

307-
let (base, _frequency) = sorted.first().unwrap().clone();
308-
base
325+
let (base, _frequency) = sorted.first().cloned()?;
326+
Some(base)
309327
}
310328
}
311329

@@ -328,9 +346,15 @@ fn main() {
328346
Endian::Little => |bytes: &[u8]| u32::from_le_bytes(bytes.try_into().unwrap()),
329347
Endian::Big => |bytes: &[u8]| u32::from_be_bytes(bytes.try_into().unwrap()),
330348
},
349+
args.max_addresses,
331350
);
332-
let base = RBase::get_most_frequent_candidate_base_address(&strings, &addresses);
333-
println!("Found base: {:x}", base);
351+
if let Some(base) =
352+
RBase::get_most_frequent_candidate_base_address(&strings, &addresses)
353+
{
354+
println!("Found base: {:0x}", base);
355+
} else {
356+
println!("No base found");
357+
}
334358
}
335359
Size::Bits64 => {
336360
let strings = RBase::get_strings_by_page_offset(args.min, args.max, bytes);
@@ -340,9 +364,15 @@ fn main() {
340364
Endian::Little => |bytes: &[u8]| u64::from_le_bytes(bytes.try_into().unwrap()),
341365
Endian::Big => |bytes: &[u8]| u64::from_be_bytes(bytes.try_into().unwrap()),
342366
},
367+
args.max_addresses,
343368
);
344-
let base = RBase::get_most_frequent_candidate_base_address(&strings, &addresses);
345-
println!("Found base: {:x}", base);
369+
if let Some(base) =
370+
RBase::get_most_frequent_candidate_base_address(&strings, &addresses)
371+
{
372+
println!("Found base: {:x}", base);
373+
} else {
374+
println!("No base found");
375+
}
346376
}
347377
};
348378
let end = start.elapsed();
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"files":{"COPYING":"01c266bced4a434da0051174d6bee16a4c82cf634e2679b6155d40d75012390f","Cargo.toml":"88c12a803c6c06c47cd9dabc8bcdba81f35d3bab637221d2106a86a543532731","DESIGN.md":"59c960e1b73b1d7fb41e4df6c0c1b1fcf44dd2ebc8a349597a7d0595f8cb5130","LICENSE-MIT":"0f96a83840e146e43c0ec96a22ec1f392e0680e6c1226e6f3ba87e0740af850f","README.md":"afc4d559a98cf190029af0bf320fc0022725e349cd2a303aac860254e28f3c53","UNLICENSE":"7e12e5df4bae12cb21581ba157ced20e1986a0508dd10d0e8a4ab9a4cf94e85c","rustfmt.toml":"1ca600239a27401c4a43f363cf3f38183a212affc1f31bff3ae93234bbaec228","src/ahocorasick.rs":"c699c07df70be45c666e128509ad571a7649d2073e4ae16ac1efd6793c9c6890","src/automaton.rs":"22258a3e118672413119f8f543a9b912cce954e63524575c0ebfdf9011f9c2dd","src/dfa.rs":"bfef1a94c5e7410584b1beb4e857b40d1ae2031b881cbc06fb1300409bbd555f","src/lib.rs":"2a92d5c5e930f2d306508802e8a929135e1f41c9f5f8deda8f7eb98947179dd2","src/macros.rs":"c6c52ae05b24433cffaca7b78b3645d797862c5d5feffddf9f54909095ed6e05","src/nfa/contiguous.rs":"aeb6ee5fd80eea04decbc4b46aa27d1ab270b78d416a644da25b7934f009ee66","src/nfa/mod.rs":"ee7b3109774d14bbad5239c16bb980dd6b8185ec136d94fbaf2f0dc27d5ffa15","src/nfa/noncontiguous.rs":"de94f02b04efd8744fb096759a8897c22012b0e0ca3ace161fd87c71befefe04","src/packed/api.rs":"160d3b10823316f7b0924e13c3afd222c8a7db5c0a00432401f311ef27d6a1b7","src/packed/ext.rs":"66be06fde8558429da23a290584d4b9fae665bf64c2578db4fe5f5f3ee864869","src/packed/mod.rs":"0020cd6f07ba5c8955923a9516d7f758864260eda53a6b6f629131c45ddeec62","src/packed/pattern.rs":"1e3a289a730c141fc30b295811e372d046c6619c7fd670308299b889a06c7673","src/packed/rabinkarp.rs":"403146eb1d838a84601d171393542340513cd1ee7ff750f2372161dd47746586","src/packed/teddy/README.md":"3a43194b64e221543d885176aba3beb1224a927385a20eca842daf6b0ea2f342","src/packed/teddy/builder.rs":"08ec116a4a842a2bb1221d296a2515ef3672c54906bed588fb733364c07855d3","src/packed/teddy/generic.rs":"ea252ab05b32cea7dd9d71e332071d243db7dd0362e049252a27e5881ba2bf39","src/packed/
teddy/mod.rs":"17d741f7e2fb9dbac5ba7d1bd4542cf1e35e9f146ace728e23fe6bbed20028b2","src/packed/tests.rs":"8e2f56eb3890ed3876ecb47d3121996e416563127b6430110d7b516df3f83b4b","src/packed/vector.rs":"70c325cfa6f7c5c4c9a6af7b133b75a29e65990a7fe0b9a4c4ce3c3d5a0fe587","src/tests.rs":"c68192ab97b6161d0d6ee96fefd80cc7d14e4486ddcd8d1f82b5c92432c24ed5","src/transducer.rs":"02daa33a5d6dac41dcfd67f51df7c0d4a91c5131c781fb54c4de3520c585a6e1","src/util/alphabet.rs":"6dc22658a38deddc0279892035b18870d4585069e35ba7c7e649a24509acfbcc","src/util/buffer.rs":"f9e37f662c46c6ecd734458dedbe76c3bb0e84a93b6b0117c0d4ad3042413891","src/util/byte_frequencies.rs":"2fb85b381c038c1e44ce94294531cdcd339dca48b1e61f41455666e802cbbc9e","src/util/debug.rs":"ab301ad59aa912529cb97233a54a05914dd3cb2ec43e6fec7334170b97ac5998","src/util/error.rs":"ecccd60e7406305023efcc6adcc826eeeb083ab8f7fbfe3d97469438cd4c4e5c","src/util/int.rs":"e264e6abebf5622b59f6500210773db36048371c4e509c930263334095959a52","src/util/mod.rs":"7ab28d11323ecdbd982087f32eb8bceeee84f1a2583f3aae27039c36d58cf12c","src/util/prefilter.rs":"9fa4498f18bf70478b1996c1a013698b626d15f119aa81dbc536673c9f045718","src/util/primitives.rs":"f89f3fa1d8db4e37de9ca767c6d05e346404837cade6d063bba68972fafa610b","src/util/remapper.rs":"9f12d911583a325c11806eeceb46d0dfec863cfcfa241aed84d31af73da746e5","src/util/search.rs":"6af803e08b8b8c8a33db100623f1621b0d741616524ce40893d8316897f27ffe","src/util/special.rs":"7d2f9cb9dd9771f59816e829b2d96b1239996f32939ba98764e121696c52b146"},"package":"8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"}

vendor/aho-corasick/COPYING

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
This project is dual-licensed under the Unlicense and MIT licenses.
2+
3+
You may use this code under the terms of either license.

vendor/aho-corasick/Cargo.toml

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
2+
#
3+
# When uploading crates to the registry Cargo will automatically
4+
# "normalize" Cargo.toml files for maximal compatibility
5+
# with all versions of Cargo and also rewrite `path` dependencies
6+
# to registry (e.g., crates.io) dependencies.
7+
#
8+
# If you are reading this file be aware that the original Cargo.toml
9+
# will likely look very different (and much more reasonable).
10+
# See Cargo.toml.orig for the original contents.
11+
12+
[package]
13+
edition = "2021"
14+
rust-version = "1.60.0"
15+
name = "aho-corasick"
16+
version = "1.1.3"
17+
authors = ["Andrew Gallant <[email protected]>"]
18+
exclude = [
19+
"/aho-corasick-debug",
20+
"/benchmarks",
21+
"/tmp",
22+
]
23+
autotests = false
24+
description = "Fast multiple substring searching."
25+
homepage = "https://github.com/BurntSushi/aho-corasick"
26+
readme = "README.md"
27+
keywords = [
28+
"string",
29+
"search",
30+
"text",
31+
"pattern",
32+
"multi",
33+
]
34+
categories = ["text-processing"]
35+
license = "Unlicense OR MIT"
36+
repository = "https://github.com/BurntSushi/aho-corasick"
37+
38+
[package.metadata.docs.rs]
39+
all-features = true
40+
rustdoc-args = [
41+
"--cfg",
42+
"docsrs",
43+
"--generate-link-to-definition",
44+
]
45+
46+
[profile.bench]
47+
debug = 2
48+
49+
[profile.release]
50+
debug = 2
51+
52+
[lib]
53+
name = "aho_corasick"
54+
55+
[dependencies.log]
56+
version = "0.4.17"
57+
optional = true
58+
59+
[dependencies.memchr]
60+
version = "2.4.0"
61+
optional = true
62+
default-features = false
63+
64+
[dev-dependencies.doc-comment]
65+
version = "0.3.3"
66+
67+
[features]
68+
default = [
69+
"std",
70+
"perf-literal",
71+
]
72+
logging = ["dep:log"]
73+
perf-literal = ["dep:memchr"]
74+
std = ["memchr?/std"]

0 commit comments

Comments (0)