Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@
def run1(args, src_name, num_runs):
    """Time `num_runs` executions of a command and print one results-table row.

    args      -- command line (list) of the program under test; the input file
                 'pg.txt' and the output file 'out.txt' are appended to it.
    src_name  -- label printed in the last column of the row.
    num_runs  -- how many times the command is executed.

    The row shows the fastest and slowest wall-clock time and the throughput
    (SIZE bytes over the fastest run, in millions per second).
    """
    # timeit.default_timer is a wall clock on every supported interpreter
    # (perf_counter on Python 3); time.clock() on Python 2/Unix counts only this
    # process's CPU time and would not see the child process at all.
    from timeit import default_timer

    timings = []
    for _ in range(num_runs):
        started = default_timer()
        proc = subprocess.Popen(args + ['pg.txt', 'out.txt'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        # communicate() drains the pipe while waiting; wait() alone can block
        # forever once the child fills the stdout pipe buffer.
        proc.communicate()
        timings.append(default_timer() - started)
    timings.sort()
    print('| %.3f..%.3fs | %.1f | %s |' % (timings[0], timings[-1], SIZE / timings[0] / 1000000, src_name))

Expand Down
63 changes: 37 additions & 26 deletions src/hack01.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ extern "C" {
// 64-bit FNV parameters: the hash of an empty sequence and the per-byte multiplier.
static constexpr uint64_t offset_basis = 14695981039346656037LU;
static constexpr uint64_t prime = 1099511628211;

// Folds one character into a running FNV-1a hash: XOR the byte in, then
// multiply by the prime. Start from offset_basis and feed the characters in
// order. The character is widened as an unsigned octet so that bytes >= 0x80
// do not sign-extend and flip the upper 56 bits of the state.
inline uint64_t update_hash(uint64_t h, char ch) { return (h ^ static_cast<unsigned char>(ch)) * prime; }

int usage(char *process_name) {
std::cout << "Usage: " << process_name << " <filename>" << std::endl;
Expand All @@ -62,12 +62,33 @@ using Dict = ska::flat_hash_map<uint64_t, DictValue,
HashKey>; //works with std::unordered_map as well, and still faster
//using Dict = std::unordered_map<uint64_t, DictValue, HashKey>;

// File-scope state read by the sort comparator and its helper; main assigns
// these before calling std::sort.

// Table of word strings indexed by DictValue::first.
const std::vector<std::string> *strings = nullptr;
// Start of the input bytes; DictValue::first is an offset from here.
const uint8_t *all_data = nullptr;
// One past the last input byte.
const uint8_t *end_data = nullptr;
// Byte -> normalized character table (0 means "not part of a word").
const char *all_letters = nullptr;

// Copies the normalized word starting at `data` into `buf` as a C string.
//
// Every source byte is translated through `letters`; copying stops at the
// first byte that translates to 0 (that 0 becomes the terminator) or when
// `end_data` is reached.
//
//   data      first byte of the word
//   end_data  one past the last readable input byte
//   letters   translation table indexed by byte value; 0 marks a non-word byte
//   buf       destination; must have room for the word plus the terminator
//             (the length is not checked here)
static void lower_to(const uint8_t *data, const uint8_t *end_data, const char *letters, char *buf)
{
    for (; data != end_data; ++data, ++buf) {
        *buf = letters[*data];
        if (!*buf)
            return;
    }
    // The input ended in the middle of a word (no trailing delimiter), so no
    // 0 was copied above. Terminate explicitly; otherwise the caller would
    // print whatever bytes were left in the buffer.
    *buf = '\0';
}

// Strict "less than" between the words starting at `a` and `b`, compared after
// translation through the file-scope table `all_letters`. A word ends at the
// first byte that translates to 0 or at the file-scope `end_data`.
//
// Returns true only when the word at `a` orders before the word at `b`; equal
// words (including a == b) yield false, as a sort comparator requires.
static bool compare_insensitive(const uint8_t *a, const uint8_t *b)
{
    // `ahead` tracks whichever pointer lies further into the input. It has to
    // advance together with a and b, otherwise the end-of-input test never
    // fires and the loop reads past end_data.
    const uint8_t *ahead = a < b ? b : a;
    for (; ahead != end_data; ++a, ++b, ++ahead) {
        // Compare as unsigned so the order does not depend on whether plain
        // char is signed on this platform.
        const unsigned char chA = static_cast<unsigned char>(all_letters[*a]);
        const unsigned char chB = static_cast<unsigned char>(all_letters[*b]);
        if (chA != chB)
            return chA < chB;
        if (chA == 0)
            return false; // both words ended together: equal, do not scan the following text
    }
    // The word nearer the end of the input was cut off by end_data after
    // matching so far. It is the smaller one only if it is `a` and the other
    // word still continues at `b` (b < end_data here, so *b is readable).
    return a > b && all_letters[*b] != 0;
}

struct IndicesIterator {
bool operator()(const DictValue &a, const DictValue &b) {
if (a.second == b.second) {
return (*strings)[a.first] < (*strings)[b.first];
return compare_insensitive(all_data+a.first, all_data+b.first);
}

return a.second > b.second;
Expand All @@ -88,7 +109,7 @@ int main(int argc, char **argv) {

const uint8_t *begin = reinterpret_cast<const uint8_t *>(
mmap(NULL, fsz, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd, 0));

const uint8_t *cbegin = begin;
char letters[256];
for (size_t i = 0; i < 256; ++i) {
letters[i] = letterize(i);
Expand All @@ -100,21 +121,16 @@ int main(int argc, char **argv) {
}

Dict dict(500000);
std::vector<std::string> all_strings;
all_strings.reserve(500000);

char buf[256];
size_t clen = 0;

size_t cnt = 0;
uint32_t count = 0;
uint64_t key_hash = offset_basis;
for (auto end = begin + fsz; begin != end; ++begin) {
const auto ch = letters[*begin];

if (ch) {
key_hash = update_hash(key_hash, ch);
buf[clen++] = ch;
clen++;
continue;
}

Expand All @@ -126,51 +142,46 @@ int main(int argc, char **argv) {
// end of word
auto it = dict.find(key_hash);
if (it == dict.end()) {
dict.insert(it, {key_hash, DictValue(count, 1)});
buf[clen] = 0;
all_strings.push_back(buf);
++count;
dict.insert(it, {key_hash, DictValue(uint32_t(begin-cbegin-clen), 1)});
} else {
++(it->second.second);
}
++cnt;
key_hash = offset_basis;
buf[clen = 0] = 0;
clen=0;
}
std::cout << cnt << std::endl;
std::cout << count << std::endl;

// last word
if (key_hash != offset_basis) {
auto it = dict.find(key_hash);
if (it == dict.end()) {
dict.insert(it, {key_hash, DictValue(count, 1)});
buf[clen] = 0;
all_strings.push_back(buf);
++count;
dict.insert(it, {key_hash, DictValue(uint32_t(begin-cbegin-clen), 1)});
} else {
++(it->second.second);
}
}
close(fd);

std::vector<DictValue> freqs;
freqs.resize(count);
freqs.resize(dict.size());
uint32_t fi = 0;
for (auto &d : dict)
freqs[fi++] = d.second;

strings = &all_strings;
all_data = cbegin;
end_data = begin;
all_letters = letters;
std::sort(freqs.begin(), freqs.end(), IndicesIterator()); //we can instead sort indices, ofc

FILE *out = fopen(argv[2], "w");
if (!out) {
std::cerr << "Can't write file" << argv[2] << std::endl;
exit(1);
}
char buf[256];
for (auto i : freqs) {
fprintf(out, "%d %s\n", i.second, all_strings[i.first].c_str());
lower_to(cbegin + i.first, end_data, letters, buf);
fprintf(out, "%d %s\n", i.second, buf);
}
fclose(out);
close(fd);
return 0;
}