kappa-architecture.html

<!doctype html>
<html lang="en">

	<head>
		<meta charset="utf-8">

		<title>Kappa architecture</title>

		<!-- Metadata describing this talk and its author (taken from the title and closing slides) -->
		<meta name="description" content="Discovering Kappa Architecture the hard way: generating click / scroll heatmaps, from relational access to streaming access">
		<meta name="author" content="Honza Novotný">

		<meta name="apple-mobile-web-app-capable" content="yes" />
		<meta name="apple-mobile-web-app-status-bar-style" content="black-translucent" />

		<!-- NOTE(review): maximum-scale=1.0 / user-scalable=no disable pinch zoom; presumably inherited from the reveal.js template, confirm before relaxing -->
		<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no">

		<!-- Core presentation styles and the active theme (the id lets scripts find the theme link) -->
		<link rel="stylesheet" href="css/reveal.css">
		<link rel="stylesheet" href="css/theme/night.css" id="theme">

		<!-- For syntax highlighting -->
		<link rel="stylesheet" href="lib/css/zenburn.css">

		<!-- If the query includes 'print-pdf', use the PDF print sheet, otherwise the paper print sheet -->
		<script>
			document.write( '<link rel="stylesheet" href="css/print/' + ( window.location.search.match( /print-pdf/gi ) ? 'pdf' : 'paper' ) + '.css" type="text/css" media="print">' );
		</script>

		<!--[if lt IE 9]>
		<script src="lib/js/html5shiv.js"></script>
		<![endif]-->
	</head>

	<body>

        <!-- FG Forrest logo link, absolutely positioned 40px from the bottom and horizontally centred
             (left: 50% with a negative margin of half the 140px image width), stacked with z-index 20 -->
        <div style="display: block; position: absolute; bottom: 40px; left: 50%; margin-left: -70px; z-index: 20;">
            <a href="http://www.fg.cz"><img src="img/jopenspace2013/FG_Forrest_neg.png" width="140" alt="FG Forrest"/></a>
        </div>

		<div class="reveal">

			<div class="slides">
				<!-- Title slide: talk name, the two articles it refers to, and the speaker -->
				<section data-background="img/backgrounds/black.jpg">
					<h1>Discovering Kappa Architecture the hard way</h1>
					<p>
						<strong>instead of learning from</strong>
					</p>
					<h3>
						<a href="http://jameskinley.tumblr.com/post/37398560534/the-lambda-architecture-principles-for-architecting">The Lambda architecture</a><br/>
						<a href="http://radar.oreilly.com/2014/07/questioning-the-lambda-architecture.html">Questioning the Lambda Architecture</a>
					</h3>
					<p>
						<!-- Twitter host spelled correctly, matching the link on the closing slide -->
						<small>Honza <a href="http://www.twitter.com/novoj">@Novoj</a> Novotný</small>
					</p>
				</section>
                <section>
                    <!-- Slide: problem introduction with two example heatmap screenshots and a live demo link -->
                    <section data-background="img/backgrounds/brown.jpg">
                        <h1>Problem introduction</h1>
                        <p>Generating <strong>click / scroll</strong> heatmaps</p>
                        <div style="text-align: center; width: 100%; clear: both">
                            <div style="height: 400px; clear: both; vertical-align: middle">
                                <!-- The click map is floated left, so it is displayed before the scroll map despite the source order -->
                                <img src="img/kappa/scrollmap.png" width="40%" style="margin-left: 50px" alt="Scroll heatmap example"/>
                                <img src="img/kappa/clickmap.png" width="45%" style="float: left;" alt="Click heatmap example"/>
                            </div>
                        </div>
                        <p><a href="http://www.fg.cz/?mt.openConsole=" target="_blank" rel="noopener">DEMO</a></p>
                    </section>
                    <!-- Slide: constraints the heatmap solution has to satisfy -->
                    <section data-background="img/backgrounds/big-data.jpg">
                      <h2>Constraints</h2>
                      <ul>
                        <li>big but simple structured data (time, x, y, url, viewport)</li>
                        <li>constant input stream - 24h/day</li>
                        <li>heatmaps must be returned within a second</li>
                        <li>minor data loss is acceptable</li>
                      </ul>
                    </section>
                </section>
                <section>
                    <!-- Slide: first architecture iteration, diagram floated left of the bullet list -->
                    <section data-background="img/backgrounds/brown.jpg">
                        <h1>Architecture: relational access</h1>
                        <img src="img/kappa/relational-access.png" style="float: left; padding: 20px;" alt="Diagram of the relational access architecture"/>
                        <ul style="margin-top: 2em">
                            <li>MySQL - click / scroll per row</li>
                            <li>SQL group by to get aggregated data</li>
                            <li>gradients generated by client
                                <ul>
                                    <li>storage effective</li>
                                    <li>computational load moved to client</li>
                                </ul>
                            </li>
                            <li>working prototype in 24 hours</li>
                        </ul>
                    </section>
                    <!-- Slide: observations from the relational access iteration.
                         Each follow-up line is a line break plus an arrow indented by 3em. -->
                    <section data-background="img/backgrounds/experience.jpg">
                      <h2>Observations</h2>
                      <ul>
                        <li>input stream serialization
                          <br><span style="margin-left: 3em;">&rarr;</span>sufficient performance</li>
                        <li>generating heatmap data
                          <br><span style="margin-left: 3em;">&rarr;</span>noticeably slowing from 500k records
                          <br><span style="margin-left: 3em;">&rarr;</span>becoming unusable from 1m records upwards (takes seconds)</li>
                        <li>client handles maximum of thousands gradients in realtime
                          <br><span style="margin-left: 3em;">&rarr;</span>we need to preprocess data on server
                        </li>
                        <li>algorithm for excluding uninteresting points based on DB triggers
                          <br><span style="margin-left: 3em;">&rarr;</span>not sufficient, programmatic access necessary</li>
                      </ul>
                    </section>
                </section>
                <section>
                    <!-- Slide: second architecture iteration (per-row journal for the current day plus precomputed indexes) -->
                    <section data-background="img/backgrounds/brown.jpg">
                        <h1>Architecture: index + diff</h1>
                        <img src="img/kappa/document-access.png" style="float: left; width: 45%;" alt="Diagram of the index + diff architecture"/>
                        <ul style="margin-top: 1em">
                            <li>MySQL
                                <ul>
                                    <li>record per row only for current day</li>
                                    <li>precomputed indexes for previous days / months</li>
                                    <li>clearing journal table after index computation <br/>
                                        <span style="margin-left: 3em;">&rarr;</span>we need to keep row count low
                                    </li>
                                </ul>
                            </li>
                            <li>current day computed on the fly - the old way</li>
                            <li>night jobs compute day/month indexes
                                <ul>
                                    <li><a href="https://github.com/EsotericSoftware/kryo">Kryo</a> serialized binary in MySQL BLOB</li>
                                </ul>
                            </li>
                            <li>SQL reads several rows + map / reduce</li>
                        </ul>
                    </section>
                    <!-- Slide: observations from the index + diff iteration -->
                    <section data-background="img/backgrounds/experience.jpg">
                        <h2>Observations</h2>
                        <ul>
                            <li>input stream serialization
                                <br/><span style="margin-left: 3em;">&rarr;</span>sufficient performance</li>
                            <li>querying (reducing) millions of records
                                <br/><span style="margin-left: 3em;">&rarr;</span>within 1 sec</li>
                            <li>do we need ACID properties for our task?
                                <br/><span style="margin-left: 3em;">&rarr;</span>not at all - choosing db with less guarantees might add performance boost</li>
                            <li>jobs are potential bottleneck
                                <br/><span style="margin-left: 3em;">&rarr;</span>we need to ensure that daily data are converted to indexes on regular basis
                                <br/><span style="margin-left: 3em;">&rarr;</span>when to execute jobs (time zones)?!
                                <br/><span style="margin-left: 3em;">&rarr;</span>unpredictable load peaks or data processing delays
                                <br/><span style="margin-left: 3em;">&rarr;</span>how to repair incorrect indexes?</li>
                        </ul>
                    </section>
                    <!-- Slide: Lambda Architecture overview image next to a list of example technologies per layer.
                         Links that open a new tab carry rel="noopener" so the opened page gets no handle to this window. -->
                    <section data-background="img/backgrounds/brown.jpg">
                        <h2>Lambda Architecture</h2>
                        <p><a href="http://lambda-architecture.net/">lambda-architecture.net</a></p>
                        <img src="http://lambda-architecture.net/img/la-overview_small.png" width="47%" alt="Lambda Architecture overview diagram"/>
                        <ul style="margin-top: 1em; width: 47%; vertical-align: top">
                            <li><strong>Input:</strong>
                                    <a href="http://en.wikipedia.org/wiki/Java_Message_Service" target="_blank" rel="noopener">JMS</a>,
                                    <a href="http://kafka.apache.org/" target="_blank" rel="noopener">Kafka</a>,
                                    <a href="https://github.com/twitter/kestrel" target="_blank" rel="noopener">Kestrel</a>
                                    and others
                            </li>
                            <li>
                                <strong>Batch layer:</strong>
                                <a href="http://hadoop.apache.org/docs/stable/api/org/apache/hadoop/mapreduce/package-summary.html" target="_blank" rel="noopener">Hadoop MapReduce</a>,
                                <a href="https://spark.apache.org/docs/latest/" target="_blank" rel="noopener">Spark</a>,
                                <a href="http://pig.apache.org/" target="_blank" rel="noopener">Pig</a>,
                                and <a href="http://lambda-architecture.net/components/2014-06-30-batch-components/" target="_blank" rel="noopener">others</a>
                            </li>
                            <li>
                                <strong>Speed layer:</strong>
                                <a href="http://storm-project.net/" target="_blank" rel="noopener">Storm</a>,
                                <a href="http://samza.incubator.apache.org/" target="_blank" rel="noopener">Samza</a>,
                                <a href="http://projects.spring.io/spring-xd/" target="_blank" rel="noopener">Spring XD</a>,
                                and <a href="http://lambda-architecture.net/components/2014-06-30-speed-components/" target="_blank" rel="noopener">others</a>
                            </li>
                            <li>
                                <strong>Serving layer:</strong>
                                <a href="http://druid.io/" target="_blank" rel="noopener">Druid</a>,
                                <a href="http://www.project-voldemort.com/voldemort/" target="_blank" rel="noopener">Voldemort</a>,
                                <a href="http://sploutsql.com/" target="_blank" rel="noopener">Splout SQL</a>,
                                and <a href="http://lambda-architecture.net/components/2014-06-30-serving-components/" target="_blank" rel="noopener">others</a>
                            </li>
                        </ul>
                    </section>
                    <!-- Slide: worked Lambda Architecture example; the ordered list describes the steps of the linked example -->
                    <section data-background="img/backgrounds/brown.jpg">
                        <h2>Lambda Architecture example</h2>
                        <p>
                            <a href="http://www.datasalt.com/2013/01/an-example-lambda-architecture-using-trident-hadoop-and-splout-sql/" target="_blank" rel="noopener">count hashtag appearances in tweets by day / hour</a><br/>
                            <a href="http://lambda-architecture.net/">lambda-architecture.net</a>
                        </p>
                        <img src="https://raw.github.com/pereferrera/trident-lambda-splout/master/TridentSploutArch-medium-numbered.png" width="35%" alt="Numbered architecture diagram of the Trident, Hadoop and Splout SQL example"/>
                        <ol style="width: 47%; padding-left: 1em; vertical-align: top; padding-top: 1em">
                            <li>Tweets are ingested from Kafka</li>
                            <li>Trident (STORM) saves data to HDFS<br/>
                                Trident (STORM) computes counts and stores them in memory</li>
                            <li>Hadoop MapReduce processes files on HDFS and generates others with counts of hashtags by date</li>
                            <li>SploutSQL indexes file with counts and deploys it to the SploutSQL cluster</li>
                            <li>Trident (STORM - <a href="https://storm.apache.org/documentation/Distributed-RPC.html" target="_blank" rel="noopener">DRPC</a>) handles queries by combining subqueries to memory state and SploutSQL indexes</li>
                        </ol>
                    </section>
                    <!-- Slide: pros and cons of the Lambda Architecture in two floated columns -->
                    <section data-background="img/backgrounds/brown.jpg">
                        <h2 style="font-size: 2.5em">Questioning Lambda Architecture</h2>
                        <!-- Image-only link: the alt text names the link destination so the link has an accessible name -->
                        <a href="http://radar.oreilly.com/2014/07/questioning-the-lambda-architecture.html" target="_blank" rel="noopener"><img src="img/kappa/linkedin_logo_11.jpg" width="250" alt="Questioning the Lambda Architecture article on O'Reilly Radar"/></a>
                        <div>
                            <div style="float: left; width: 48%">
                                <h3>Pros</h3>
                                <ul style="padding-top: 1em;">
                                    <li>keeping original data log enables reprocessing original data in case of bug introduction or algorithm evolution</li>
                                    <li>beats CAP theorem by combining multiple systems with different tradeoffs?!? <em>#probablyNot</em></li>
                                </ul>
                            </div>
                            <div style="float: left; width: 48%">
                                <h3>Cons</h3>
                                <ul style="padding-top: 1em">
                                    <li>you need to implement application logic twice &rarr; Hadoop MapReduce jobs + Trident (STORM) implementation <em>#costly #hardToMaintain #bugProne</em></li>
                                    <li>you may use abstraction (<a href="https://github.com/twitter/summingbird" target="_blank" rel="noopener">SummingBird</a> for example) but you will operate on least common denominator <em>#anotherLevelOfAbstraction</em></li>
                                    <li>anyway it requires deep knowledge of both subsystems - realtime / batch</li>
                                </ul>
                            </div>
                        </div>
                    </section>
                </section>
                <section>
                    <!-- Slide: third architecture iteration (streaming access), diagram floated left of the bullet list -->
                    <section data-background="img/backgrounds/brown.jpg">
                        <h1>Architecture: streaming access</h1>
                        <img src="img/kappa/streaming-access.png" style="float: left; width: 47%;" alt="Diagram of the streaming access architecture"/>
                        <ul style="margin-top: 0.5em; width: 47%">
                            <li>Mongo DB instead of MySQL<br/>
                                replicated cluster (write/read node) + arbiter on balancer</li>
                            <li>chunked flat files = journal<br/>
                                journal ZIPped and backed up</li>
                            <li>indexes for day / month computed on the fly<br/>
                                merged with MongoDB index on <a href="http://ehcache.org/">EhCache</a> evict</li>
                            <li><a href="https://github.com/EsotericSoftware/kryo">Kryo</a> serialized blobs in Mongo DB binary field<br/>
                                storage and network effective, must be updated as a whole</li>
                            <li>querying several documents + live EhCache index<br/>&rarr; map / reduce</li>
                            <li>unified processing logic</li>
                            <li>no nightly jobs<br/>cache evict distributes batch updates through all the time</li>
                        </ul>
                    </section>
                    <!-- Slide: observations from the streaming access iteration -->
                    <section data-background="img/backgrounds/experience.jpg">
                        <h2>Observations</h2>
                        <ul>
                            <li>input stream serialization
                                <br/><span style="margin-left: 3em;">&rarr;</span>performance 3k reqs/sec</li>
                            <li>handles millions of records query
                                <br/><span style="margin-left: 3em;">&rarr;</span>within 0.5 secs</li>
                            <li>runs pretty well on commodity HW
                                <br/><span style="margin-left: 3em;">&rarr;</span>several hundreds CZK/month</li>
                            <!-- This item is closed exactly once; a duplicate closing tag used to follow it -->
                            <li>better scaling possibility
                                <br/><span style="margin-left: 3em;">&rarr;</span>reading from secondaries
                                <br/><span style="margin-left: 3em;">&rarr;</span>sharding</li>
                            <li>algorithm evolution
                                <br/><span style="margin-left: 3em;">&rarr;</span>replay tool can easily reprocess files from journal via original streaming API</li>
                        </ul>
                    </section>
                    <!-- Slide: four captioned charts; each tile is a left-floated box 43% wide and 43% high with a small centred caption -->
                    <section data-background="img/backgrounds/experience.jpg">
                      <h2>Performance testing</h2>

                      <!-- Tile: hits per second -->
                      <div style="width: 43%; height: 43%; float: left; text-align: center; font-size: 0.5em; line-height: 1em">
                        <img src="img/kappa/hits.png" alt="Hits per second"><br>
                        Hits per second
                      </div>

                      <!-- Tile: traffic -->
                      <div style="width: 43%; height: 43%; float: left; text-align: center; font-size: 0.5em; line-height: 1em">
                        <img src="img/kappa/traffic.png" alt="Traffic in bytes"><br>
                        Traffic in bytes
                      </div>

                      <!-- Tile: CPU load -->
                      <div style="width: 43%; height: 43%; float: left; text-align: center; font-size: 0.5em; line-height: 1em">
                        <img src="img/kappa/cpu-load.png" alt="CPU"><br>
                        CPU load
                      </div>

                      <!-- Tile: memory -->
                      <div style="width: 43%; height: 43%; float: left; text-align: center; font-size: 0.5em; line-height: 1em">
                        <img src="img/kappa/memory.png" alt="memory"><br>
                        Memory
                      </div>
                    </section>
                    <!-- Slide: database statistics (left column) and processed record counts per month (right column) -->
                    <section data-background="img/backgrounds/experience.jpg">
                        <h2>Current database size</h2>
                        <p>no BIG data yet, no SMALL data already</p>
                        <div style="float: left; width: 48%">
                            <h3>MongoDB stats</h3>
                            <!-- The sample is kept at column 0 because it sits inside pre/code; no comma after the last member so it is valid JSON -->
                            <pre><code class="hljs json" data-trim>
{
    "db" : "monkeyTracker",
    "objects" : 3908201,
    "avgObjSize" : 395.4592550383156,
    "dataSize" : 1545534256,
    "storageSize" : 1913036800,
    "indexSize" : 756549808,
    "fileSize" : 4226809856
}
                        </code></pre>
                        </div>
                        <div style="float: left; width: 48%">
                            <h3>Records processed since 11/2014</h3>
                            <table>
                                <thead>
                                    <tr>
                                        <th scope="col">Month</th>
                                        <th scope="col" style="text-align: right">Clicks</th>
                                        <th scope="col" style="text-align: right">Scrolls</th>
                                    </tr>
                                </thead>
                                <tbody>
                                    <tr>
                                        <td>November</td>
                                        <td style="text-align: right">4,641,660</td>
                                        <td style="text-align: right">2,668,661</td>
                                    </tr>
                                    <tr>
                                        <td>December</td>
                                        <td style="text-align: right">8,016,352</td>
                                        <td style="text-align: right">3,940,576</td>
                                    </tr>
                                    <tr>
                                        <td>January</td>
                                        <td style="text-align: right">8,088,716</td>
                                        <td style="text-align: right">4,557,283</td>
                                    </tr>
                                    <tr>
                                        <td>February</td>
                                        <td style="text-align: right">9,759,176</td>
                                        <td style="text-align: right">5,012,504</td>
                                    </tr>
                                    <!-- NOTE(review): the totals exceed the sum of the four months listed above
                                         (clicks add up to 30,505,904, scrolls to 16,179,024); presumably a later
                                         partial month is included, confirm the figures with the author -->
                                    <tr>
                                        <td>Total</td>
                                        <td style="text-align: right">33,931,572</td>
                                        <td style="text-align: right">17,402,555</td>
                                    </tr>
                                </tbody>
                            </table>
                        </div>
                    </section>
                    <!-- Slide: Kappa Architecture overview image, example technologies per stage, and open problems -->
                    <section data-background="img/backgrounds/brown.jpg">
                        <h1>Kappa Architecture</h1>
                        <p>
                            <a href="http://radar.oreilly.com/2014/07/questioning-the-lambda-architecture.html">Questioning the Lambda Architecture (LinkedIn)</a><br/>
                            <a href="http://www.kappa-architecture.com/">www.Kappa-Architecture.com</a>
                        </p>
                        <img src="http://s.radar.oreilly.com/wp-files/2/2014/06/kappa.png" width="48%" alt="Kappa Architecture overview diagram"/>
                        <ul style="vertical-align: top; margin-top: 1em">
                            <li>Input: <a href="http://kafka.apache.org/" target="_blank" rel="noopener">Kafka</a> ...</li>
                            <li>Processing: <a href="http://samza.apache.org/" target="_blank" rel="noopener">Samza</a> ...</li>
                            <li>Serving: <a href="http://druid.io/" target="_blank" rel="noopener">Druid</a> ...</li>
                        </ul>
                        <p>Not without problems ...</p>
                        <ul style="vertical-align: top; margin-top: 1em">
                            <li>exactly once strategy</li>
                            <li>connectors</li>
                            <li>maturity</li>
                        </ul>
                    </section>
                </section>
                <!-- Closing slide: product link, mascot image and speaker contact details -->
                <section data-background="img/backgrounds/black.jpg">
                  <h2>Try <a href="https://www.monkeytracker.cz">MonkeyTracker</a> on your own!</h2>
                  <!-- Inline style removes the background and border from this image -->
                  <img src="img/kappa/mtreg-monkey.png" alt="MonkeyTracker" style="background: transparent; border: none">
                  <p>
                    <small>Honza Novotný, FG Forrest</small>
                    <br>
                    <small><a href="http://www.twitter.com/novoj">@novoj</a></small>
                    <br>
                    <small><a href="http://blog.novoj.net">http://blog.novoj.net</a></small>
                  </p>
                </section>
			</div>

		</div>

		<!-- Libraries must load before the inline start-up script below, which uses the Reveal global -->
		<script src="lib/js/head.min.js"></script>
		<script src="js/reveal.js"></script>
		<script src="js/jquery.min.js"></script>

		<script>

			// Starts the presentation.
			// Full list of configuration options available here:
			// https://github.com/hakimel/reveal.js#configuration
			(function() {

				// Options passed on the URL; theme and transition may be overridden from there
				var query = Reveal.getQueryHash();

				// Predicates used to decide whether an optional library is loaded
				var lacksClassList = function() { return !document.body.classList; };
				var hasClassList = function() { return !!document.body.classList; };
				var usesMarkdown = function() { return !!document.querySelector( '[data-markdown]' ); };

				// Optional libraries used to extend on reveal.js
				var optionalLibraries = [
					// classList shim, loaded only when the browser has no native classList
					{ src: 'lib/js/classList.js', condition: lacksClassList },
					// Markdown support, loaded only when some element carries data-markdown
					{ src: 'plugin/markdown/marked.js', condition: usesMarkdown },
					{ src: 'plugin/markdown/markdown.js', condition: usesMarkdown },
					// Syntax highlighting; highlighting is set up once the script has loaded
					{ src: 'plugin/highlight/highlight.js', async: true, callback: function() { hljs.initHighlightingOnLoad(); } },
					// Zoom and speaker notes plugins, loaded only when classList is available
					{ src: 'plugin/zoom-js/zoom.js', async: true, condition: hasClassList },
					{ src: 'plugin/notes/notes.js', async: true, condition: hasClassList }
				];

				Reveal.initialize({
					controls: true,
					progress: true,
					history: true,
					center: true,

					// Slide size passed to reveal.js
					width: 1200,
					height: 900,

					theme: query.theme, // available themes are in /css/theme
					transition: query.transition || 'default', // default/cube/page/concave/zoom/linear/fade/none

					dependencies: optionalLibraries
				});

			})();
		</script>

	</body>
</html>