Implementation of MapReduce using TTG #221
base: master
@@ -0,0 +1,270 @@
#include <fstream>
#include <algorithm>
#include <iostream>
#include <cstdlib>     // std::atoi, std::rand()
#include <cassert>     // assert()
#include <iomanip>
#include <sstream>     // std::istringstream, used by the mapper
#include <string>
#include <memory>
#include <map>
#include <chrono>
#include <filesystem>
#include "ttg.h"

#define BLOCK_SIZE 32

using namespace ttg;

template<typename T>
using Key = std::pair<std::pair<std::string, T>, T>;
template<typename T>
using MapKey = std::multimap<std::string, T>;

namespace madness {
  namespace archive {
    template <class Archive, typename T>
    struct ArchiveStoreImpl<Archive, MapKey<T>> {
      static inline void store(const Archive& ar, const MapKey<T>& mk) {
        int size = mk.size();
        ar & size;
        typename MapKey<T>::const_iterator it = mk.begin();
        while (size--) {
          ar & it->first;
          ar & it->second;
          it++;
        }
      }
    };

    template <class Archive, typename T>
    struct ArchiveLoadImpl<Archive, MapKey<T>> {
      static inline void load(const Archive& ar, MapKey<T>& mk) {
        int size;
        ar & size;
        while (size--) {
          std::string s;
          T v;
          ar & s;
          ar & v;
          mk.insert(std::make_pair(s, v));
        }
      }
    };
  }
}

template <typename T>
std::ostream& operator<<(std::ostream& s, const Key<T>& key) {
  s << "Key((" << key.first.first << "," << key.first.second << "), " << key.second << ")";
  return s;
}

template<typename T>
auto make_reader(Edge<Key<T>, std::string>& mapEdge)
{
  auto f = [](const Key<T>& filename, std::tuple<Out<Key<T>, std::string>>& out) {
    // Check if the file exists.
    std::ifstream fin(filename.first.first);
    std::filesystem::path p{filename.first.first};

    std::cout << "The size of " << p.u8string() << " is "
              << std::filesystem::file_size(p) << " bytes.\n";

    if (!fin) {
      std::cout << "File not found : " << filename.first.first << std::endl;
      ttg_abort();
    }

    // Read the file in chunks and send each chunk to a mapper.
    std::string buffer; // holds at most BLOCK_SIZE bytes plus any carried-over partial word
    buffer.resize(BLOCK_SIZE);
    int first = 0;
    int chunkID = 0;

    while (!fin.eof()) {
      char* b = buffer.data(); // non-const data() requires C++17
      fin.read(b + first, BLOCK_SIZE);
      std::streamsize s = first + fin.gcount();
      buffer.resize(s);
      // Special handling to avoid splitting words between chunks:
      // carry the bytes after the last whitespace over to the next chunk.
      if (s > 0) {
        auto last = buffer.find_last_of(" \t\n");
        first = s - last - 1;
        std::string tmp;
        if (fin) {
          tmp.resize(BLOCK_SIZE + first);
          if (first > 0) tmp.replace(0, first, buffer, last + 1, first);
        }
        buffer.resize(last);
        //std::cout << buffer << std::endl;
        send<0>(std::make_pair(std::make_pair(filename.first.first, chunkID), 0), buffer, out);
        buffer = tmp;
        chunkID++;
      }
    }
  };

  return make_tt<Key<T>>(f, edges(), edges(mapEdge), "reader", {}, {"mapEdge"});
}

template<typename T>
void mapper(std::string chunk, MapKey<T>& resultMap) {
  // Prepare the string by removing all punctuation marks.
  chunk.erase(std::remove_if(chunk.begin(), chunk.end(),
                             [](unsigned char c) -> bool { return std::ispunct(c); }),
              chunk.end());
  std::istringstream ss(chunk);
  std::string word;

  while (ss >> word)
  {
    std::transform(word.begin(), word.end(), word.begin(), ::tolower);
    //std::cout << "Mapped " << word << std::endl;
    resultMap.insert(std::make_pair(word, 1));

Review comment: What if the chunk contains the same word twice? And why use a multimap in the first place?

Reply: I used multimap to be able to hold duplicate keys and have them sorted, which makes counting easier in this example.

  }
}

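The review thread on the mapper above asks why a multimap is used rather than collapsing duplicate words right away. For comparison, here is a minimal combiner-style sketch (not part of this PR; the function name is hypothetical, and adopting it would also require the reduce edge to carry std::map<std::string, T> instead of MapKey<T>):

// Sketch only: a hypothetical combiner-style mapper. Duplicate words within a
// chunk collapse to a single (word, count) entry, so "the the the" produces
// ("the", 3) instead of three ("the", 1) multimap entries.
#include <algorithm>
#include <cctype>
#include <map>
#include <sstream>
#include <string>

template <typename T>
void combining_mapper(std::string chunk, std::map<std::string, T>& counts) {
  // Strip punctuation, mirroring the mapper in this PR.
  chunk.erase(std::remove_if(chunk.begin(), chunk.end(),
                             [](unsigned char c) { return std::ispunct(c) != 0; }),
              chunk.end());
  std::istringstream ss(chunk);
  std::string word;
  while (ss >> word) {
    std::transform(word.begin(), word.end(), word.begin(), ::tolower);
    ++counts[word];  // operator[] value-initializes the count to 0 on first use
  }
}

With per-chunk pre-aggregation the reducer would only merge per-word partial sums, at the cost of losing the sorted-duplicate layout that the current multimap-based reducer relies on.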
template <typename funcT, typename T>
auto make_mapper(const funcT& func, Edge<Key<T>, std::string>& mapEdge, Edge<Key<T>, MapKey<T>>& reduceEdge)
{
  auto f = [func](const Key<T>& key, std::string& chunk, std::tuple<Out<Key<T>, MapKey<T>>>& out)
  {
    MapKey<T> resultMap;
    // Call the mapper function
    func(chunk, resultMap);
    send<0>(key, resultMap, out);
  };

  return make_tt(f, edges(mapEdge), edges(reduceEdge), "mapper", {"mapEdge"}, {"reduceEdge"});
}

template <typename funcT, typename T>
auto make_reducer(const funcT& func, Edge<Key<T>, MapKey<T>>& reduceEdge,
                  Edge<void, std::pair<std::string, T>>& writerEdge)
{
  auto f = [func](const Key<T>& key, MapKey<T> inputMap,
                  std::tuple<Out<Key<T>, MapKey<T>>,
                             Out<void, std::pair<std::string, T>>>& out)
  {
    typename MapKey<T>::iterator iter;
    int value = 0;
    // Need a tokenID to make keys unique for recurrence.
    int tokenID = key.second + 1;
    //std::cout << "Received: " << key.first.second << ":" << key.second << std::endl;

    iter = inputMap.begin();

    // Count of elements with the same key (the multimap keeps duplicates adjacent).
    int count = inputMap.count(iter->first);
    if (count > 1) {
      // The first word occurs multiple times: walk the whole map, folding runs of
      // equal keys with func and emitting one (word, total) pair per distinct word.
      while (iter != inputMap.end() && !inputMap.empty())
      {
        if (count == 0) count = inputMap.count(iter->first); // reload the count for each distinct key
        value = func(value, iter->second);
        count--;
        if (count == 0) {
          sendv<1>(std::make_pair(iter->first, value), out);
          value = 0;
        }
        inputMap.erase(iter);
        iter = inputMap.begin();
      }
      if (!inputMap.empty() && iter != inputMap.end()) {
        send<0>(std::make_pair(key.first, tokenID), inputMap, out);
      }
    }
    else {
      // The first word occurs exactly once: emit it, then recur on the rest of the
      // map under a fresh tokenID so the remaining words are reduced by later tasks.
      sendv<1>(std::make_pair(iter->first, iter->second), out);
      inputMap.erase(iter);
      if (!inputMap.empty()) {
        iter = inputMap.begin();
        if (iter != inputMap.end()) {
          send<0>(std::make_pair(key.first, tokenID), inputMap, out);
        }
      }
    }
  };

  return make_tt(f, edges(reduceEdge), edges(reduceEdge, writerEdge), "reducer", {"reduceEdge"},
                 {"recurReduceEdge", "writerEdge"});
}

template<typename T>
auto make_writer(std::map<std::string, T>& resultMap, Edge<void, std::pair<std::string, T>>& writerEdge)
{
  auto f = [&resultMap](std::pair<std::string, T>& value, std::tuple<>& out) {
    // Accumulate the partial counts coming from the reducer into the final result map.
    auto it = resultMap.find(value.first);
    if (it != resultMap.end())
      resultMap[value.first] += value.second;
    else
      resultMap.insert(value);
  };

  return make_tt<void>(f, edges(writerEdge), edges(), "writer", {"writerEdge"}, {});
}

int main(int argc, char* argv[]) {
  if (argc < 2)
  {
    std::cout << "Usage: ./mapreduce file1 [file2, ...]\n";
    exit(-1);
  }

  std::chrono::time_point<std::chrono::high_resolution_clock> beg, end;

  ttg::ttg_initialize(argc, argv, -1);
  //OpBase::set_trace_all(true);

  Edge<Key<int>, std::string> mapEdge;
  Edge<Key<int>, MapKey<int>> reduceEdge;
  Edge<void, std::pair<std::string, int>> writerEdge;

  auto rd = make_reader(mapEdge);
  auto m = make_mapper(mapper<int>, mapEdge, reduceEdge);
  auto r = make_reducer(std::plus<int>(), reduceEdge, writerEdge);

  std::map<std::string, int> result;
  auto w = make_writer(result, writerEdge);

  int world_size = ttg::default_execution_context().size();
  auto keymap = [world_size](const Key<int>& key) {
    // Run each chunk on a process, not efficient, just for testing!
    return key.first.second % world_size;
  };

  rd->set_keymap(keymap);
  m->set_keymap(keymap);
  r->set_keymap(keymap);

  auto connected = make_graph_executable(rd.get());
  assert(connected);
  TTGUNUSED(connected);
  //std::cout << "Graph is connected.\n";

  if (ttg::ttg_default_execution_context().rank() == 0) {
    //std::cout << "==== begin dot ====\n";
    //std::cout << Dot()(rd.get()) << std::endl;
    //std::cout << "==== end dot ====\n";

    beg = std::chrono::high_resolution_clock::now();
    for (int i = 1; i < argc; i++) {
      std::string s(argv[i]);
      rd->invoke(std::make_pair(std::make_pair(s, 0), 0));
    }
  }

  execute();
  fence();

  if (ttg::default_execution_context().rank() == 0) {
    end = std::chrono::high_resolution_clock::now();

    std::cout << "Mapreduce took "
              << std::chrono::duration_cast<std::chrono::seconds>(end - beg).count()
              << " seconds" << std::endl;
    for (auto it : result) {
      std::cout << it.first << " " << it.second << std::endl;
    }
  }

  finalize();
  return 0;
}
@@ -0,0 +1,44 @@
word count from Wikipedia the free encyclopedia
the word count is the number of words in a document or passage of text Word counting may be needed when a text
is required to stay within certain numbers of words This may particularly be the case in academia legal
proceedings journalism and advertising Word count is commonly used by translators to determine the price for
the translation job Word counts may also be used to calculate measures of readability and to measure typing
and reading speeds usually in words per minute When converting character counts to words a measure of five or
six characters to a word is generally used Contents Details and variations of definition Software In fiction
In non fiction See also References Sources External links Details and variations of definition
This section does not cite any references or sources Please help improve this section by adding citations to
reliable sources Unsourced material may be challenged and removed
Variations in the operational definitions of how to count the words can occur namely what counts as a word and
which words don't count toward the total However especially since the advent of widespread word processing there
is a broad consensus on these operational definitions and hence the bottom line integer result
The consensus is to accept the text segmentation rules generally found in most word processing software including how
word boundaries are determined which depends on how word dividers are defined The first trait of that definition is that a space any of various whitespace
characters such as a regular word space an em space or a tab character is a word divider Usually a hyphen or a slash is too
Different word counting programs may give varying results depending on the text segmentation rule
details and on whether words outside the main text such as footnotes endnotes or hidden text) are counted But the behavior
of most major word processing applications is broadly similar However during the era when school assignments were done in
handwriting or with typewriters the rules for these definitions often differed from todays consensus
Most importantly many students were drilled on the rule that certain words don't count usually articles namely a an the but
sometimes also others such as conjunctions for example and or but and some prepositions usually to of Hyphenated permanent
compounds such as follow up noun or long term adjective were counted as one word To save the time and effort of counting
word by word often a rule of thumb for the average number of words per line was used such as 10 words per line These rules
have fallen by the wayside in the word processing era the word count feature of such software which follows the text
segmentation rules mentioned earlier is now the standard arbiter because it is largely consistent across documents and
applications and because it is fast effortless and costless already included with the application As for which sections of
a document count toward the total such as footnotes endnotes abstracts reference lists and bibliographies tables figure
captions hidden text the person in charge teacher client can define their choice and users students workers can simply
select or exclude the elements accordingly and watch the word count automatically update Software Modern web browsers
support word counting via extensions via a JavaScript bookmarklet or a script that is hosted in a website Most word
processors can also count words Unix like systems include a program wc specifically for word counting
As explained earlier different word counting programs may give varying results depending on the text segmentation rule
details The exact number of words often is not a strict requirement thus the variation is acceptable
In fiction Novelist Jane Smiley suggests that length is an important quality of the novel However novels can vary
tremendously in length Smiley lists novels as typically being between and words while National Novel Writing Month
requires its novels to be at least words There are no firm rules for example the boundary between a novella and a novel
is arbitrary and a literary work may be difficult to categorise But while the length of a novel is to a large extent up
to its writer lengths may also vary by subgenre many chapter books for children start at a length of about words and a
typical mystery novel might be in the to word range while a thriller could be over words
The Science Fiction and Fantasy Writers of America specifies word lengths for each category of its Nebula award categories
Classification Word count Novel over words Novella to words Novelette to words Short story under words
In non fiction The acceptable length of an academic dissertation varies greatly dependent predominantly on the subject
Numerous American universities limit Ph.D. dissertations to at most words barring special permission for exceeding this limit
@@ -0,0 +1,10 @@
A
B
A
B
C
A
B
C
D
A
Review comment: Do we need to send the filename as part of the key? Is it used after reading from the file?
Reply: This example counts words from multiple files. In order to keep the keys unique, I included the filename in the key. We can probably come up with a different way of having unique keys without using the filename, like a file ID for example.
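Following up on this exchange, one alternative (purely a sketch, not part of this PR; the struct name and fields are hypothetical) is to key tasks on a numeric file ID assigned in main() and pass the filename to the reader once as a task input value. Note that user-defined TTG keys also need hashing and serialization support, which is omitted here:

// Sketch only: a hypothetical key without the filename. The file ID could be,
// for example, the index of the file on the command line; chunkID and tokenID
// play the same roles as in the current Key<T>.
#include <tuple>

struct FileKey {
  int fileID = 0;   // which input file (e.g. position in argv)
  int chunkID = 0;  // which BLOCK_SIZE chunk of that file
  int tokenID = 0;  // recurrence counter used by the reducer
  bool operator==(const FileKey& other) const {
    return std::tie(fileID, chunkID, tokenID) ==
           std::tie(other.fileID, other.chunkID, other.tokenID);
  }
};

// The keymap in main() would then distribute work by file and chunk, e.g.:
//   auto keymap = [world_size](const FileKey& k) {
//     return (k.fileID + k.chunkID) % world_size;
//   };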