diff --git a/_data/authors.yml b/_data/authors.yml index 4a28b6b678..7c4267fa93 100644 --- a/_data/authors.yml +++ b/_data/authors.yml @@ -59,6 +59,7 @@ active-authors: - jharris - jhenderson - jheward + - jisted - jleftley - jmcevoy - jporter @@ -154,12 +155,6 @@ authors: name: "Alex Segrove" picture: picture.jpg author-summary: "Hi, Im Alex - Delivery Manager based in the London office. I spent the last few years in the Government Digital Service heading up delivery for GOV.UK, Digital Identity and Government Data policy, then worked with governments around the world to help them with their own digital transformation." - bpritchard: - name: "Beth Pritchard" - email: bpritchard@scottlogic.com - author-summary: "A developer at Scott Logic, based in Newcastle. Interested in all things cloud. Partial to loud music, video games, and hanging out with my cat. (they/them)" - twitter-handle: null - picture: bpritchard.jpg aaspellc: name: "Andy Aspell-Clark" author-summary: "

I am a Senior Developer at Scott Logic. I am interested in all programming topics from how a computer goes from power on to displaying windows on the screen or how a CPU handles branch prediction to how to write a mobile UI using kotlin or cordova.

" @@ -252,6 +247,11 @@ authors: bmumford: name: "Ben Mumford" picture: picture.jpg + bpritchard: + name: "Beth Pritchard" + email: bpritchard@scottlogic.com + author-summary: "A developer at Scott Logic, based in Newcastle. Interested in all things cloud. Partial to loud music, video games, and hanging out with my cat." + picture: bpritchard.jpg cburbidge: name: "Chester Burbidge" author-summary: "Software Engineer at Scott Logic. Experienced in dotnet but interested in all areas of software development, especially 'devops' and functional programming." @@ -296,7 +296,8 @@ authors: picture: picture.png csalt: name: "Caitlin Salt" - author-summary: "I'm a Lead Developer at Scott Logic, based in our Bristol office. My interests are varied and wide-ranging, but I tend to specialise in .NET and cloud-hosted technologies. When I'm not at a keyboard, I get passionate about almost any sort of outdated and vintage machinery." + author-summary: "I'm a Technical Principal at Scott Logic, based in our Bristol office. My interests are varied and wide-ranging, but I tend to specialise in .NET and cloud-hosted technologies. When I'm not at a keyboard, I get passionate about almost any sort of outdated and vintage machinery." + feed-description: "Blog posts by Caitlin Salt, Technical Principal at Scott Logic" picture: picture.jpg cstephen: name: "Craig Stephen" @@ -1430,7 +1431,7 @@ authors: picture: yblowers.jpeg lperrett: name: "Lisa Perrett" - author-summary: "I'm a Senior Test Engineer, based in our Newcastle office" + author-summary: "I'm a Senior Test Engineer, based in our Newcastle office. My other key interests are in Artificial Intelligence and Psychology." picture: lisa-profile.png lvincent: name: "Luke Vincent" @@ -1465,3 +1466,8 @@ authors: email: wmckenzie@scottlogic.com author-summary: "Senior developer based in the Newcastle office with experience of front and backend development across web and mobile as well as DevOps and automation." 
picture: wmckenzie.jpg + jisted: + name: "Josh Isted" + email: jisted@scottlogic.com + author-summary: "Originally from a humanities and languages background, I'm now a Senior Developer at Scott Logic based in the Newcastle office, with an interest in a bit of everything tech-wise. Outside of that I like going to other countries trying to eat all of their food." + picture: jisted.jpg diff --git a/_data/related.yml b/_data/related.yml index 306b967a74..385dc2a85a 100644 --- a/_data/related.yml +++ b/_data/related.yml @@ -50,8 +50,8 @@ - /2013/06/07/scala-macros-part-3.html - /2013/06/05/scala-macros-part-1.html /2013/06/07/isolated-scrum-does-scrum-need-to-be-complemented-with-extreme-programming-to-succeed.html: - - /2014/05/08/rapid-software-testing.html - /2013/01/04/codeproject-article-on-xamarin-cross-platform-development.html + - /2013/01/07/codeproject-mvp-2013.html /2013/06/07/scala-macros-part-3.html: - /2013/06/17/scala-macros-part-4.html - /2013/06/05/scala-macros-part-1.html @@ -155,8 +155,8 @@ - /2013/01/04/codeproject-article-on-xamarin-cross-platform-development.html - /2013/01/07/codeproject-mvp-2013.html /2014/05/08/rapid-software-testing.html: - - /2013/06/07/isolated-scrum-does-scrum-need-to-be-complemented-with-extreme-programming-to-succeed.html - /2013/01/04/codeproject-article-on-xamarin-cross-platform-development.html + - /2013/01/07/codeproject-mvp-2013.html /2014/05/11/reactivecocoa-tableview-binding.html: - /2014/11/04/mutable-array-binding-reactivecocoa.html - /2014/07/07/mvvm-reactive-cocoa.html @@ -251,8 +251,8 @@ - /2014/08/01/swift-vs-objectivec.html - /2013/09/19/ios-book-published.html /2014/09/18/the-agile-mindset.html: + - /2024/04/17/is-agile-the-answer.html - /2017/05/17/what-it-means-to-be-agile.html - - /2018/08/16/sharing-our-experience-migrating-to-agile-mid-project.html /2014/09/19/d3-svg-chart-performance.html: - /2014/08/19/an-ohlc-chart-component-for-d3.html - 
/2014/09/26/an-interactive-stock-comparison-chart-with-d3.html @@ -912,7 +912,7 @@ - /2022/09/12/how-data-has-improved-the-amateur-runner.html /2017/02/28/relogic.html: - /2017/03/09/relogic-2.html - - /2018/06/11/pros-cons-of-redux-form.html + - /2020/05/01/demystifying-react-redux.html /2017/03/01/cassandra-vs-mariadb.html: - /2017/03/20/cassandra-vs-mariadb-scaling.html - /2017/10/06/cassandra-eventual-consistency.html @@ -953,8 +953,8 @@ - /2017/07/16/clabot.html - /2017/05/22/gifbot-github-integration.html /2017/05/15/insurance-a-customers-perspective.html: + - /2024/03/27/ecosystems-over-user-experience-redefining-priorities-in-financial-services.html - /2015/05/08/making-technology-disappear.html - - /2015/05/01/state-of-in-car-uis.html /2017/05/17/what-it-means-to-be-agile.html: - /2018/08/16/sharing-our-experience-migrating-to-agile-mid-project.html - /2019/09/23/agile-is-not-a-solved-problem.html @@ -1220,8 +1220,8 @@ - /2022/07/04/data_ai.html - /2017/02/07/the-rise-of-big-data-streaming.html /2018/03/20/agile-monster-hunting.html: + - /2024/04/17/is-agile-the-answer.html - /2014/09/18/the-agile-mindset.html - - /2018/08/16/sharing-our-experience-migrating-to-agile-mid-project.html /2018/03/20/mock-the-backend-with-node.html: - /2016/02/08/data-mocking.html - /2014/08/06/angular-knockout-e2e-testing.html @@ -1241,8 +1241,8 @@ - /2017/06/06/does-npm5-deprecate-yarn.html - /2016/03/03/modularizing-client-side-dependencies-with-jspm.html /2018/04/12/qa-qa-who-the-funk.html: - - /2013/06/07/isolated-scrum-does-scrum-need-to-be-complemented-with-extreme-programming-to-succeed.html - - /2014/05/08/rapid-software-testing.html + - /2013/01/04/codeproject-article-on-xamarin-cross-platform-development.html + - /2013/01/07/codeproject-mvp-2013.html /2018/04/16/a-hire-standard.html: - /2018/09/27/thoughts-from-the-software-testing-conference-north.html - /2018/03/08/international-women-s-day-being-a-tester-at-scott-logic.html @@ -1298,8 +1298,8 @@ - 
/2016/05/19/redux-reducer-arrays.html - /2018/10/22/ruining-react-s-performance-with-redux.html /2018/06/19/step-functions.html: + - /2024/04/19/monitoring-codebuild-build-status.html - /2019/07/29/from-monolith-to-serverless-on-aws.html - - /2019/03/04/lambda-global-state.html /2018/06/26/The-3-Types-of-Minimum-Viable-Products.html: - /2017/05/17/what-it-means-to-be-agile.html - /2016/09/08/why-can-no-one-agree-on-a-definition-of-mvc.html @@ -1323,7 +1323,7 @@ - /2016/03/09/lonely-tester.html /2018/07/17/achieving-cross-browser-testing-greatness.html: - /2020/01/13/selenium-vs-puppeteer.html - - /2019/07/03/Demystifying-test-automation.html + - /2024/04/03/will-it-automate-accessibility-testing.html /2018/07/17/the-11k-gas-bill-customer-satisfaction-and-improved-interactions.html: - /2023/12/08/conscientious-computing-accurately-measuring-the-energy-consumption-of-hardware.html - /2020/08/24/energy-transition-q3.html @@ -1697,8 +1697,8 @@ - /2023/07/13/accessibility-considerations.html - /2020/07/02/6-most-common-accessibility-problems.html /2019/09/23/agile-is-not-a-solved-problem.html: + - /2024/04/17/is-agile-the-answer.html - /2017/05/17/what-it-means-to-be-agile.html - - /2014/09/18/the-agile-mindset.html /2019/10/03/how-to-make-3d-charts-for-the-web-using-d3-and-x3dom.html: - /2020/05/01/rendering-one-million-points-with-d3.html - /2015/09/03/d3-without-d3.html @@ -1811,8 +1811,8 @@ - /2020/01/09/git-rebasing.html - /2019/12/19/source-control-when.html /2020/01/08/What-Improv-can-teach-us-about-good-code-design.html: + - /2024/04/17/is-agile-the-answer.html - /2017/05/17/what-it-means-to-be-agile.html - - /2019/04/30/ten-commandments-of-agile.html /2020/01/08/creating-a-chart-with-d3fc-and-webgl.html: - /2020/01/16/performance-of-a-d3fc-webgl-chart.html - /2020/05/01/rendering-one-million-points-with-d3.html @@ -2357,8 +2357,8 @@ - /2020/02/28/is-aeron-a-good-choice-for-a-messaging-solution.html - 
/2022/11/15/diagnosing-slow-performance-in-a-microservice-pipeline.html /2021/12/08/narrative-dashboard.html: + - /2024/04/08/how-to-make-chatgpt-do-anything-you-want.html - /2023/07/05/gpt3-creativity-from-determinism.md.html - - /2023/06/01/generative-terminology.html /2021/12/09/Performance-Testing-with-JMeter.html: - /2017/06/22/a-developer's-test.html - /2019/07/03/Demystifying-test-automation.html @@ -2556,7 +2556,7 @@ - /2016/06/01/An-update-on-Electron-Security.html /2023/02/06/beyond-the-hype-chatgpt.html: - /2023/05/02/beyond-the-hype-is-generative-ai-coming-for-programming-jobs.html - - /2023/11/03/spy-logic.html + - /2024/04/08/how-to-make-chatgpt-do-anything-you-want.html /2023/02/07/state-of-open-con.html: - /2021/10/21/financial-services-open-source.html - /2023/11/22/balancing-act-financial-organizations-and-the-constraints-on-open-source-community-engagement.html @@ -2582,8 +2582,8 @@ - /2023/03/13/beyond-the-hype-quantum-computing-part-one.html - /2022/12/15/beyond-the-hype-blockchain-is-dead-long-live-blockchain.html /2023/04/12/catching-up-with-openai.html: + - /2024/04/08/how-to-make-chatgpt-do-anything-you-want.html - /2021/12/08/narrative-dashboard.html - - /2021/08/31/a-primer-on-the-openai-api-1.html /2023/05/02/beyond-the-hype-is-generative-ai-coming-for-programming-jobs.html: - /2023/02/06/beyond-the-hype-chatgpt.html - /2023/03/31/the-new-ai-platform.html @@ -2591,8 +2591,8 @@ - /2023/03/31/the-new-ai-platform.html - /2023/07/26/how-we-de-risked-a-genai-chatbot.html /2023/05/04/langchain-mini.html: + - /2024/04/08/how-to-make-chatgpt-do-anything-you-want.html - /2023/10/23/building_scottbot_an_ai_chatbot_for_scott_logic.html - - /2023/11/14/convincing-langchain.html /2023/05/30/the-product-owner-role.html: - /2023/03/30/im-no-longer-a-tester-im-a-checker-or-am-i.html - /2023/06/26/peer-review-tips.html @@ -2604,7 +2604,7 @@ - /2023/06/20/jsnation-conference-2023.html /2023/06/01/generative-terminology.html: - 
/2023/03/31/the-new-ai-platform.html - - /2021/08/31/a-primer-on-the-openai-api-1.html + - /2024/04/08/how-to-make-chatgpt-do-anything-you-want.html /2023/06/02/what-pride-and-allyship-mean-to-me.html: - /2022/06/16/pride-2022.html - /2019/08/06/why-scott-logic-celebrated-pride-month.html @@ -2624,8 +2624,8 @@ - /2023/03/30/im-no-longer-a-tester-im-a-checker-or-am-i.html - /2023/06/20/using-the-very-app-we-created-graduate-project-2023.html /2023/06/29/hands-on-example-of-chatgpt-as-a-programming-tool.html: + - /2024/04/08/how-to-make-chatgpt-do-anything-you-want.html - /2018/03/20/mock-the-backend-with-node.html - - /2023/11/03/spy-logic.html /2023/06/29/making-digital-products-accessible-insights-from-a-panel-discussion.html: - /2023/07/13/accessibility-considerations.html - /2019/09/11/Designing-with-accessibility-in-mind.html @@ -2705,8 +2705,8 @@ - /2016/03/09/lonely-tester.html - /2019/02/06/tester-s-guide-during-agile-transition.html /2023/09/27/accessibility-tooling-wave-vs-axe.html: + - /2024/04/03/will-it-automate-accessibility-testing.html - /2020/07/02/6-most-common-accessibility-problems.html - - /2019/09/11/Designing-with-accessibility-in-mind.html /2023/09/27/architecting-a-regenerative-future-thoughts-from-intersection23.html: - /2018/02/14/interaction18.html - /2016/03/08/interaction16.html @@ -2754,7 +2754,7 @@ - /2023/07/13/accessibility-considerations.html /2023/11/06/testing-with-intent-a-technical-view.html: - /2023/11/06/testing-with-intent-a-path-to-embedded-accessibility.html - - /2020/07/21/7-tips-for-transforming-testing.html + - /2024/04/03/will-it-automate-accessibility-testing.html /2023/11/06/working-in-a-blameless-culture.html: - /2023/08/17/celebrating-failure.html - /2018/03/20/agile-monster-hunting.html @@ -2768,8 +2768,8 @@ - /2023/10/23/building_scottbot_an_ai_chatbot_for_scott_logic.html - /2023/11/03/spy-logic.html /2023/11/14/data-distribution.html: + - /2024/04/22/message_types.html - 
/2018/04/17/comparing-big-data-messaging.html - - /2023/11/07/understand-your-data-requirements.html /2023/11/14/testing-LLM-based-applications-strategy-and-challenges.html: - /2023/12/18/implementing-cost-effective-test-driven-development-in-an-llm-application.html - /2023/10/23/building_scottbot_an_ai_chatbot_for_scott_logic.html @@ -2778,7 +2778,7 @@ - /2023/03/31/the-new-ai-platform.html /2023/11/16/OpenAI-Outage-November-2023.html: - /2023/10/23/building_scottbot_an_ai_chatbot_for_scott_logic.html - - /2023/07/26/how-we-de-risked-a-genai-chatbot.html + - /2024/04/08/how-to-make-chatgpt-do-anything-you-want.html /2023/11/16/using-the-t-test-for-effective-testing-of-nondeterministic-AI-systems.html: - /2023/12/18/implementing-cost-effective-test-driven-development-in-an-llm-application.html - /2023/11/14/testing-LLM-based-applications-strategy-and-challenges.html @@ -2801,8 +2801,8 @@ - /2018/03/13/tensorflow-machine-learning.html - /2018/10/25/a-developers-intro-tensorflow-and-keras.html /2023/12/08/conscientious-computing-accurately-measuring-the-energy-consumption-of-hardware.html: + - /2024/04/05/carbon-emissions-of-end-user-devices-part-one.html - /2023/10/19/tools-for-measuring-cloud-carbon-emissions.html - - /2023/11/23/conscientious-cloud-pick-your-cloud-region-deliberately.html /2023/12/13/diffedit.html: - /2021/08/31/a-primer-on-the-openai-api-1.html - /2021/09/02/a-primer-on-the-openai-api-3.html @@ -2838,7 +2838,7 @@ - /2023/02/06/beyond-the-hype-chatgpt.html /2024/01/22/beyond-the-hype-was-the-threat-the-cra-seemed-to-pose-to-open-source-just-hype.html: - /2024/02/19/beyond-the-hype-can-technology-sustainability-really-make-a-difference.html - - /2023/06/20/using-the-very-app-we-created-graduate-project-2023.html + - /2024/04/18/beyond-the-hype-are-data-mesh-and-data-fabric-just-marchitecture.html /2024/01/23/reflections-from-ukgovcamp-xl.html: - /2024/03/22/insights-from-the-government-digital-sustainability-alliance-gdsa-2024-summit.html - 
/2019/03/22/sdig19.html @@ -2853,7 +2853,7 @@ - /2024/03/15/technology-carbon-standard-update.html /2024/02/19/beyond-the-hype-can-technology-sustainability-really-make-a-difference.html: - /2024/01/22/beyond-the-hype-was-the-threat-the-cra-seemed-to-pose-to-open-source-just-hype.html - - /2019/12/13/edinburgh-open-source-in-fintech-event-series-dec-2019.html + - /2024/04/18/beyond-the-hype-are-data-mesh-and-data-fabric-just-marchitecture.html /2024/02/21/how-the-tables-turned-my-life-with-spylogic.html: - /2023/11/03/spy-logic.html - /2023/11/14/testing-LLM-based-applications-strategy-and-challenges.html @@ -2882,8 +2882,47 @@ - /2023/07/07/reset-connect-conference-2023.html - /2023/11/09/the-sustainable-computing-ecosystem.html /2024/03/26/cloud-business-intelligence-a-comparative-analysis.html: + - /2024/04/23/are-we-ready-to-put-ai-in-the-hands-of-business-users.html - /2020/04/09/how-to-connect-power-bi-with-azure-devops-and-jira.html - - /2018/10/23/automating-terraform-abstracting-terraform-behind-a-web-service.html /2024/03/26/generating-d3-code-with-llms.html: + - /2024/04/08/how-to-make-chatgpt-do-anything-you-want.html - /2023/10/23/building_scottbot_an_ai_chatbot_for_scott_logic.html - - /2015/09/15/internProject2015.html \ No newline at end of file +/2024/03/27/ecosystems-over-user-experience-redefining-priorities-in-financial-services.html: + - /2020/09/10/open-your-eyes-to-apis.html + - /2020/09/30/power-moves.html +/2024/04/03/will-it-automate-accessibility-testing.html: + - /2023/09/27/accessibility-tooling-wave-vs-axe.html + - /2018/07/17/achieving-cross-browser-testing-greatness.html +/2024/04/05/carbon-emissions-of-end-user-devices-part-one.html: + - /2023/09/12/sustainability-terminology.html + - /2023/10/19/tools-for-measuring-cloud-carbon-emissions.html +/2024/04/08/how-to-make-chatgpt-do-anything-you-want.html: + - /2023/10/23/building_scottbot_an_ai_chatbot_for_scott_logic.html + - /2023/11/16/OpenAI-Outage-November-2023.html 
+/2024/04/11/Carbon-Hack-24--Leveraging-the-Impact Framework-to-Estimate-the-Carbon-Cost-of-Cloud-Storage.html: + - /2023/10/19/tools-for-measuring-cloud-carbon-emissions.html + - /2024/04/05/carbon-emissions-of-end-user-devices-part-one.html +/2024/04/15/how-cxos-can-spot-technology-greenwashing.html: + - /2024/02/13/announcing-the-proposed-technology-carbon-standard.html + - /2024/03/06/technology-carbon-standard-update.html +/2024/04/15/the-changing-world-in-life-it-software-testing-and-artifical-intelligence.html: + - /2014/08/11/a-piecemeal-approach-to-introducing-agile.html + - /2019/03/22/a-tester-s-story-of-adapting-to-the-new-world-of-pipelines-ci-cd.html +/2024/04/17/is-agile-the-answer.html: + - /2019/09/23/agile-is-not-a-solved-problem.html + - /2014/09/18/the-agile-mindset.html +/2024/04/18/beyond-the-hype-are-data-mesh-and-data-fabric-just-marchitecture.html: + - /2024/01/22/beyond-the-hype-was-the-threat-the-cra-seemed-to-pose-to-open-source-just-hype.html + - /2024/02/19/beyond-the-hype-can-technology-sustainability-really-make-a-difference.html +/2024/04/19/monitoring-codebuild-build-status.html: + - /2019/07/29/from-monolith-to-serverless-on-aws.html + - /2018/06/19/step-functions.html +/2024/04/22/going-from-2.4-to-3.5.html: + - /2016/01/24/apache-spark-intro.html + - /2022/07/04/data_ai.html +/2024/04/22/message_types.html: + - /2023/11/14/data-distribution.html + - /2018/04/17/comparing-big-data-messaging.html +/2024/04/23/are-we-ready-to-put-ai-in-the-hands-of-business-users.html: + - /2023/03/31/the-new-ai-platform.html + - /2016/06/16/catch22-of-conversational-uis.html \ No newline at end of file diff --git a/_posts/2024-04-03-will-it-automate-accessibility-testing.md b/_posts/2024-04-03-will-it-automate-accessibility-testing.md index f357badfcf..f5f8a24af9 100644 --- a/_posts/2024-04-03-will-it-automate-accessibility-testing.md +++ b/_posts/2024-04-03-will-it-automate-accessibility-testing.md @@ -7,9 +7,10 @@ tags: - accessibility - 
automation - testing -author: wmckenzie summary: Want to make sure your application is accessible? Here are some different ways you can use your automated tests to make sure. +author: wmckenzie +image: "/uploads/Will%20it%20automate_%20thumbnail.png" --- I'm sure we've all been there, you've completed all your features, testers and product owners have signed them off, all critical bugs are resolved and you're ready for production. You've even passed PEN testing! There's just one last hurdle you've got to overcome: accessibility testing. It should be fine, right? You added alt text to your images and linked your labels with your inputs, you've got it covered... and then the report comes back. diff --git a/_posts/2024-04-08-how-to-make-chatgpt-do-anything-you-want.md b/_posts/2024-04-08-how-to-make-chatgpt-do-anything-you-want.md new file mode 100644 index 0000000000..f9d29d13ea --- /dev/null +++ b/_posts/2024-04-08-how-to-make-chatgpt-do-anything-you-want.md @@ -0,0 +1,158 @@ +--- +title: How to make ChatGPT do anything you want +date: 2024-04-08 00:00:00 Z +categories: +- colive +- Artificial Intelligence +tags: +- ChatGPT +- Large Language Models +summary: Using an example project this article explores the possibilities of integrating + a Large Language Model with custom logic to improve its utility and power. +author: colive +image: "/uploads/how%20to%20make%20chatgpt%20do%20anything%20you%20want.png" +layout: default_post +--- + +## Introduction + +OpenAI released an early demo of ChatGPT on the 30th of November 2022. Since then, it has propelled the general public’s awareness of the power of Large Language Models (LLMs). In other words, ChatGPT was the first viral AI-powered ChatBot. These Generative Pre-Trained Transformers take a prompt (a block of written text with no other context) and attempt to produce a coherent written response. 
GPT models are trained on conversation data to learn the most appropriate response to whatever new prompt they are served, but it should be noted that the training data they have been built from is all they know. They are often compared to parrots due to this behavioural trait, echoing previously seen conversations to construct sensible responses. + +By itself a tool that can do this is very powerful. It's currently commonplace on social media to see posts around using models like GPT-3.5 (the current ChatGPT model used "under the hood") to generate templates for emails, social media posts, poems, birthday party ideas and all sorts of other supposedly creative ideas. However, the current limitation of most of these models is integration into other systems and knowledge bases. A pure LLM won't know the result of the football game last week or what Elon Musk last posted. Achieving this is something the large tech companies are actively working on, but it can also be achieved by ourselves with very little code! + +In this article I will be using my personal project [PunditBot](https://github.com/Ch0live/pundit-bot) as an example of how this can work, but the concept can be applied to anything. To keep things as simple as possible, let’s talk about achieving the goal we mentioned before - finding out the result of the football game last week. + +## Using OpenAI's API + +Currently if we ask ChatGPT about this it admits it doesn't have access to current data around football games - as we've established the model doesn't actively search the web for data to integrate into its response. + +![Asking ChatGPT about the game]({{ site.github.url }}/colive/assets/askingChatGPT.png "Asking ChatGPT about the game") + +We need to build new logic around the call to OpenAI's GPT model. 
Fortunately, OpenAI also offers requests straight to their model through an API, billed to your account (billing is done by the number of tokens used in each prompt and generated in each response, where a token can be thought of as a word or part of a word). Now we can write code (here is a short python script as an example stolen directly from [OpenAI's Quickstart tutorial](https://platform.openai.com/docs/quickstart?context=python)) to call the API with our prompt. + +~~~python +from openai import OpenAI +client = OpenAI() + +completion = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "system", "content": "You are PunditBot, aimed at reporting on football games"}, + {"role": "user", "content": "Who won the man city arsenal game last weekend?"} + ] +) + +print(completion.choices[0].message) +~~~ + +When using the `/v1/chat/completions` endpoint you can specify both a system prompt (response context for the LLM) and a user prompt (the actual prompt given, often called utterance). + +## Building from others' work + +As with all programming endeavours, let's aim not to re-invent the wheel and use an example project to use as a template for our own. In PunditBot I used [the docker genai-stack project](https://github.com/docker/genai-stack). Note that there are many other repositories that can be great for starting a project like this from, this is just the one I chose. + +This repository contains other interesting tools that utilise an LLM model, but most importantly it has a Vite front end application connected to a FastAPI backend, with configuration we can plug our new OpenAI API key into. + +So now we have our own version of ChatGPT - a chat bot interface that uses an LLM for generated answers. A simplified diagram of the app architecture would look like this. 
+ +![GenAI template architecture]({{ site.github.url }}/colive/assets/basic.png "GenAI template architecture") + +## Creating a bespoke chatbot + +Our app is still unable to report on the game last week. However, our app now has a backend layer we can tailor to whatever needs we wish. One way to implement this logic could be the following: + +1. Take the question from the user and extract the football teams mentioned +2. Look up the last game played between the teams mentioned +3. Return a coherent response to the user containing that game data we extracted in step 2. + +To complete these steps, we are going to build an extractor function to identify the teams mentioned, a lookup function to find the data for the teams' last game and a summariser function to bundle up the data into a nicely written response. Exploring step 2 isn't of relevance to this article as it does not involve an LLM. We could use a local database and query it in SQL, or a tool that searches the web or requests data from a football API for the game results. Either way this functionality is not novel; let's instead explore steps 1 and 3. + +## Creating an extractor method + +The method to extract the specific football teams from a user’s utterance could be overly complex; one way to achieve this could be to search through a list of pre-defined keywords. However, this is a task LLMs are surprisingly good at. See the following prompt. + + +~~~ +You are an expert extraction algorithm tasked at extracting what football match (or matches) the user wants to know about. You must return the 2 football clubs that the user has asked about from the question. Return the 2 clubs in a comma separated list. 
+ +Football clubs must be referred to using one of the following names + +- Arsenal FC +- Brighton & Hove Albion FC +- Chelsea FC +- Crystal Palace FC +- Everton FC +- Southampton FC +- Watford FC +- West Bromwich Albion FC +- Manchester United FC +- Newcastle United FC +- AFC Bournemouth +- Burnley FC +- Leicester City FC +- Liverpool FC +- Stoke City FC +- Swansea City AFC +- Huddersfield Town AFC +- Tottenham Hotspur FC +- Manchester City FC +- West Ham United FC + +For example, for the question + +"What happened the last time arsenal played spurs?" + +You should return + +"Arsenal FC, Tottenham Hotspur FC" + +If you do not know the value of an attribute asked to extract, return null for the attribute's value. + +The question: {question} +~~~ + +This prompt is engineered to ask the LLM for a specific outcome - find from the question provided two football teams. The specific football clubs that should be returned are listed (this data is from Premier League teams during the 2017/18 season in case you are wondering why Huddersfield Town are there). There is also an example of One-Shot Learning - where an example model answer has been provided. Finally, it is mentioned that if the answer cannot be found the LLM should return null and not hallucinate a response (an LLM returning misinformation confidently as fact is often described as the LLM hallucinating). + +This works surprisingly well! Beyond clear questions specifying the names of the teams, GPT-3.5 was able to identify the two teams involved in the Manchester Derby and could even understand "the blues" as Chelsea FC. + +## Creating a summariser method + +After step 1 and step 2 we need to return to the user the game data found in a Chat Bot style response. This again is remarkably simple using an LLM, see the example prompt below. 
+ + +~~~ +You have been asked to find the results of matches between the following Premier League football teams + +here is data about the games in a python dictionary format + +{matchData} + +Summarise a response to the question using the data listed above. +Only use data listed above. +Do not infer any data outside of what is listed above. +~~~ + +Like the extractor prompt, the content within the curly brackets will contain the actual match data. In my example I injected the string representation of a python dictionary containing key value pairs of game data (half-time score, home team, date of the match, etc.). We heavily emphasise in the end of the prompt to only use mentioned data to deter the model from hallucinating. + +## Wire it all up + +Now we have all 3 steps, connecting them all together gives us the following architecture. + +![PunditBot architecture]({{ site.github.url }}/colive/assets/complete.png "PunditBot architecture (with database integration)") + +With the LLM's new ability to extract the teams mentioned in the user’s utterance, find the data related to those teams and bundle it up in a well written response we can now successfully ask about a specific football game. Note in the screenshot below the data has come from a game in the 2017/18 season - for step 2. I have used a [dataset of games from that period](https://github.com/atriantus/Neo4j-Graph-for-EPL-data) in a Neo4j graph database. + +![Asking PunditBot about the game]({{ site.github.url }}/colive/assets/askingPunditBot.png "Asking PunditBot about the game") + +Now there are obvious flaws with the current version I am showcasing above. 
+ +- As I am using a static database of games from just 1 season PunditBot is only able to retrieve data from those games to find out the "last" game that occurred + - This can be solved by adding a better way of getting the data +- The UI includes unnecessary information about whether "RAG mode" is enabled and which model is being used + - These are just parts of the UI that came with [the docker genai-stack project](https://github.com/docker/genai-stack) I based the project on and can be cleaned up +- The response is poorly formatted and hard to read + - Some tweaking of the summariser prompt could be used to enforce new lines between the 2 game reports, plus other visual adjustments if necessary. + +Ultimately, this article is concerned with the application of an LLM alongside external tools. What has been proven here is that it is possible to take an LLM like GPT-3.5 and add new functionality around it to make it more useful for specific applications. There are frameworks (such as [LangChain](https://www.langchain.com/)) that are entirely based on this idea. The idea could be taken a lot further by plugging in more tools - for example we could return a video highlight of the winning goal from both games and render that beneath the game reports. + +With some surrounding logic it's possible to achieve a lot more functionality with existing LLMs. I hope you found this interesting and potentially an inspiration for what's possible in this area of the AI space. 
diff --git a/_posts/2024-04-11-Carbon-Hack-24--Leveraging-the-Impact Framework-to-Estimate-the-Carbon-Cost-of-Cloud-Storage.md b/_posts/2024-04-11-Carbon-Hack-24--Leveraging-the-Impact Framework-to-Estimate-the-Carbon-Cost-of-Cloud-Storage.md index f9e6d426c1..c5d620734e 100644 --- a/_posts/2024-04-11-Carbon-Hack-24--Leveraging-the-Impact Framework-to-Estimate-the-Carbon-Cost-of-Cloud-Storage.md +++ b/_posts/2024-04-11-Carbon-Hack-24--Leveraging-the-Impact Framework-to-Estimate-the-Carbon-Cost-of-Cloud-Storage.md @@ -10,6 +10,7 @@ summary: This is a post on Scott Logic's contribution to the Green Software Foun 2024 Carbon Hack. Focussing on the energy cost of object storage in the cloud, this post details our process, challenges and experiences during the hackathon. author: mgriffin +image: "/uploads/carbonhack24%20thumbnail.png" --- ## Introduction @@ -56,6 +57,10 @@ Another important aspect that wasn’t being covered was the embodied carbon inv We added this estimation into the common crawl example, making use of some other built-in plugins to change data stored and drive size into the resources-reserved and resources-total figures required by the SCI-M plugin. I think this really highlighted the power of the Impact Framework in terms of creating customised calculation pipelines without writing any more code. Perhaps unsurprisingly, the potential embodied carbon cost turned out to be greater than our operational carbon estimation – another important factor that is not accounted for in AWS carbon reporting. +Here is a flowchart demonstrating our final plugin flow: + +![Common Crawl Manifest flow]({{ site.github.url }}/mgriffin/assets/common-crawl-private.svg "A flowchart demonstrating the inputs to the common-crawl-private manifest and how the plugins generate outputs.") + ## Conclusion Overall, our experience with the 2024 Carbon Hack was extremely rewarding. 
We were impressed by the rapid pace of changes being made to the Impact Framework, and we're excited to see how other teams have pushed the boundaries of this powerful tool. As the tech industry continues to grapple with its environmental impact, free and open-source tooling with flexible use cases will be essential in increasing adoption of Green Software practices. \ No newline at end of file diff --git a/_posts/2024-04-15-how-cxos-can-spot-technology-greenwashing.markdown b/_posts/2024-04-15-how-cxos-can-spot-technology-greenwashing.markdown new file mode 100644 index 0000000000..cc9b61bca5 --- /dev/null +++ b/_posts/2024-04-15-how-cxos-can-spot-technology-greenwashing.markdown @@ -0,0 +1,63 @@ +--- +title: How Can CxOs Spot Technology Greenwashing? +date: 2024-04-15 14:40:00 Z +categories: +- Sustainability +- Technology +tags: +- Sustainability +- CxO +- Advisory +- Technology +- '' +- green technologies +- greenhouse gases +summary: 'In the era of increasing environmental awareness, many organisations are + eager to showcase their commitment to sustainability. However, not all claims of + "green" or "eco-friendly" technology are accurate or substantiated. As an executive, + it is becoming increasingly important to distinguish between genuine sustainability + efforts and greenwashing -- the practice of making misleading or unsubstantiated + claims about the environmental benefits of a product or service. This blog introduces + approaches that can help you cut through the noise and make informed decisions about + your organisation''s technology sustainability. ' +author: ocronk +image: "/uploads/CXOs%20greenwashing.png" +--- + +In the era of increasing environmental awareness, many organisations are eager to showcase their commitment to sustainability. However, not all claims of "green" or "eco-friendly" technology are accurate or substantiated. 
As an executive, it is becoming increasingly important to distinguish between genuine sustainability efforts and greenwashing -- the practice of making misleading or unsubstantiated claims about the environmental benefits of a product or service. This blog introduces approaches that can help you cut through the noise and make informed decisions about your organisation's technology sustainability. + +![tech-impacts.PNG](/uploads/tech-impacts.PNG) + +The technology sustainability landscape is complex, with a multitude of factors contributing to the environmental impact of technology. This complexity can lead to misinformation and greenwashing, making it challenging for organisations to navigate the landscape effectively. What is needed are transparent and reliable resources, helping organisations make informed decisions about their technology sustainability efforts. It’s also important to acknowledge the broad range of impacts of technology – beyond the more direct and obvious electricity consumption of devices – to include the often overlooked upstream and downstream impacts. + +## Signs of Technology Greenwashing: + +1. Vague or unsubstantiated claims: Be wary of technology providers that make broad, generic statements about their products being "green" or "eco-friendly" without providing specific data or evidence to support these claims. + +2. Lack of transparency: If a technology provider is reluctant to share detailed information about their sustainability practices, emissions data, or the methodology used to calculate their carbon footprint, it may be a red flag for greenwashing. + +3. Cherry-picking data: Some companies may selectively disclose positive sustainability metrics while ignoring less favourable ones. The TCS encourages a comprehensive view of technology's environmental impact, making it easier to spot instances of cherry-picking. 
A common example of this is making claims that a service is sustainable as it uses electricity generated by renewable or low carbon sources. For one thing this is very hard to achieve in reality (given the complex nature of electricity grids) and [secondly it misses a lot of other areas of impact that technology has](https://blog.scottlogic.com/2023/10/26/conscientious-computing-facing-into-big-tech-challenges.html). + +4. Offsetting without reduction: Whilst carbon offsetting can be a part of a sustainability strategy, it should not be used as a substitute for actual emissions reduction. Be cautious of technology providers that heavily emphasise offsetting without demonstrating a commitment to reducing their carbon footprint. + +5. Inconsistency with industry standards: Look for alignment with the [GHG Protocol](https://ghgprotocol.org/), a widely accepted standard for emissions reporting and other standards such as [SBTi](https://sciencebasedtargets.org/). If a technology provider's sustainability claims seem inconsistent with industry standards, it may also indicate greenwashing. In some cases vendors are even going as far as to redefine the agreed definition of terms such as Net Zero. + +6. Box-ticking mentality: Some companies may focus on ticking boxes rather than providing substantial evidence of their sustainability efforts. The lack of agreed-upon measurement standards can contribute to this issue, and we have been working on a proposed Open Source standard - that aims to address this by helping establish a consistent approach. + +## What is the Technology Carbon Standard? + +The [Technology Carbon Standard (TCS)](https://www.techcarbonstandard.org/) is an open, standardised framework for assessing and categorising the environmental impact of enterprise technology estates. It serves as a bridge between CxOs / decision-makers and technologists by providing a common language and framework for discussing and assessing the environmental impact of technology. 
By aligning with the Greenhouse Gas (GHG) Protocol's emissions scopes, the TCS enables consistent mapping, measuring, and prioritisation of a technology's carbon footprint. This transparency empowers CxOs to make data-driven decisions and identify areas for improvement. + +![tcs.PNG](/uploads/tcs.PNG) + +## Leveraging the TCS for Transparency: + +By adopting the [Technology Carbon Standard](https://www.techcarbonstandard.org/), technology organisations can drive transparency and accountability in their organisation's technology sustainability efforts. The TCS provides a consistent framework for assessing and reporting the environmental impact of technology, making it easier to identify and address instances of greenwashing. + +Furthermore, the TCS has been designed to facilitate collaboration and knowledge-sharing between sustainability stakeholders, technology leaders, and practitioners. By getting their teams to engage with this open standard and the community behind it, CxOs can stay informed about best practices and industry trends, avoid greenwashing and steer towards meaningful action. + +It is important to acknowledge that there are gaps and inconsistencies in scope 3 emissions reporting, which can lead to incomplete or misleading sustainability claims. The TCS encourages comprehensive reporting and helps identify areas where more data or transparency is needed. + +## Conclusion: + +As environmental challenges continue to shape business decisions and stakeholder concerns, CxOs must be vigilant in identifying and addressing technology greenwashing. The Technology Carbon Standard provides a powerful tool for driving transparency, consistency, and accountability in assessing the environmental impact of enterprise technology. By leveraging the TCS and staying informed about sustainability best practices, CxOs can lead their organisations towards genuine, substantiated sustainability efforts and contribute to a more sustainable future for technology. 
Get in touch with the friendly team at Scott Logic to join the growing number of organisations that are using the TCS to drive transparency, measurement and improvement in the impact of their tech estate. diff --git a/_posts/2024-04-15-the-changing-world-in-life-it-software-testing-and-artifical-intelligence.md b/_posts/2024-04-15-the-changing-world-in-life-it-software-testing-and-artifical-intelligence.md new file mode 100644 index 0000000000..05726720c4 --- /dev/null +++ b/_posts/2024-04-15-the-changing-world-in-life-it-software-testing-and-artifical-intelligence.md @@ -0,0 +1,97 @@ +--- +title: The Changing World in Life, IT, Software Testing and Artificial Intelligence +date: 2024-04-15 00:00:00 Z +categories: +- Testing +summary: This blog looks at the changing world of the IT industry and the psychology + behind the changes. +author: lperrett +image: "/uploads/The%20changing%20world%20thumbnail_.png" +--- + +## Introduction + +This blog is a unique exploration identifying the intersection of IT and Psychology. It demonstrates the dynamic relationship between working in the IT industry and the human behaviour of how we perceive and implement change. Whilst this document is neither IT nor Psychology specific, it demonstrates the “dance” of how two worlds merge and how we embody these worlds symbiotically. + +## Setting the Scene - A Personal Observation + +After 15 years in the IT industry, I have seen multiple shifts in Software Testing, which has been an interesting and challenging experience. As I reflect on my journey in Software Testing, I see how important it has been to keep studying and learning new concepts and ideas, and putting them into practice. It is so humbling to see new languages, models, theories and implementations being developed every year by the intelligent and courageous people who want to make a difference to the world we live in. Yet somehow we are expected to magically keep pace with all this constant change. 
We learn that sometimes implementations will be welcomed and businesses will experience great success and other companies aspire to be like them; they become the role models or “thought leaders”. On the other hand, we may experience concepts being born but not “taking off”, perhaps because they were born too early or people simply did not see the benefit. + +## Concept of Change + +This brings me to the concept of “CHANGE” with a multitude of questions – to name a few: + +- How do we change ourselves to accept the world around us? +- Why are we expected to change? +- Does it psychologically sit comfortably with us or does society expect it of us? +- Do we change to survive in an ever-changing world and mask who we are and what we need? +- Do we even like the change that is right in front of us? +- Are we changing to fit in? +- Are we excited about the new changes that are ahead of us? + +Change can be implemented in two ways: + +- We are the “thought leader” implementing the change. +- Change is forced upon us and we are expected to adapt. + +When we implement change, we may expect others to “magically” adopt what we have implemented. We may not be cognizant of the other person’s experience or personal journey, perhaps the person does not wish to share their thoughts, or perhaps the person is not even known to us meaning we cannot receive feedback. This can make change difficult. + +When change is forced upon us, this is when we have to survive in an ever-changing world. It means we need to be open to new ideas and concepts, but we must never stray from who we are and what we believe in. Sometimes there is a “silent fight” that occurs inside of us, but over time we slowly learn to accept why something is in place. We learn to openly discuss and ask “why”, which helps us to understand the world around us, and sometimes the fight is no longer possible and does not align with our inner desires. 
Perhaps our perspective will change with additional knowledge, or perhaps not. We can practise being okay with this feeling. + +This is not necessarily just about the IT Industry, but about how we live our lives. We may be the “silent voice” in the background that is sceptical, or who sees the world differently. We may be the “overly excited voice” that fully embraces the change and wants to lead with others at the front because we can see exciting new prospects and how the change can benefit society. + +## Creating Context: Bridging IT and Psychology + +Now let’s bridge the worlds of IT and Psychology together by exploring the interplay behind technological decisions and psychological implications. In the forthcoming sections, we delve into where these two disciplines converge, providing emphasis on the transformative aspects of change inherent to the IT industry. Integrating these perspectives and sharing my personal experience and observations, I aim to unveil valuable insights into the evolving landscape that shapes the IT industry today. + +## Academic Questioning + +During my academic studies, lecturers would often explain their life experiences and would happily share their wisdom. I personally love to hear of people’s experiences/stories that they hold so dearly in their heart. Lecturers would often be asked “Why are you doing it?” (it = implementation of a feature or idea and turning it into software). They would answer from their own experience when people asked them this question. 
They would answer: “well…I want to make a difference and I feel it could really change how we work in the world,” “well…I’m just learning and I want to experiment and see where it takes me,” “well…I’m starting this project and I’m piloting how it will look and I require feedback for my concept to take off,” “well…because I can.” + +The latter always made me laugh, and I never understood it, but using this answer throughout my life helped other people to understand that we can do something “because we can”. If we never try new things, we cannot change and grow as individuals or organisations. Change can be scary, but change can also be great. Change should be used wisely. We should really think about why we want to change, what prompts us to stare at the same piece of work, the same theory, the same working model, the same piece of code and ask ourselves “why am I repeating this activity?” + +## Changing to Survive or Thrive + +Change can happen either unconsciously or consciously. Certain psychological processes can happen unconsciously in the background, but conversely, we can consciously try to adapt. When we adapt consciously, we change ourselves through our goals, continuous learning and self-reflections, to name a few. This helps us to venture into the concept of change and this makes us more adaptable and more resilient. + +Change is constantly happening around us, and we have evolved as humans to adapt to changing circumstances. However, this is where some difficulties may present themselves as we can question why we change with ease or why we experience difficulties. This is where we can feel how we perceive the change and what impact it has on ourselves, our projects, our businesses, our industries and our society. We may be able to influence that change for ourselves or others around us directly or indirectly. 
We can also address our choices as to whether to accept, reject or challenge the changes in relation to our personal lives or the professional products that we are delivering for the IT industry. + +Adapting to change is a natural process, especially in today’s fast-paced and ever-changing world. Changing to survive does not necessarily mean we have to mask who we are or what we need, as it is essential to balance the new circumstances with our inner desires. Authenticity can create a genuine connection with ourselves and others who may be experiencing the same thing. We can be self-aware and flexible to our needs and preferences, which helps us to determine if we like what we see or whether we feel pressure to adapt. + +Change can promote resilience, which helps us to deal with the uncertainty that it brings. Once we settle into the change, it can become our new version of “normal” and allow us to accept that we have to adapt to changes in the future. + +## Mentoring + +Mentoring can be so important in experiencing and embracing change. A personal mentor sees the world differently from the perspective of the “other”. If you are lucky and have a super-wise mentor, they will help you see that it is not just about your perspective, or their perspective, but the perspectives of many, which uncovers many more paths for us to explore. Ultimately, our voice, our body language and how we perform the implementation have the potential to change the lives of many and they will be the next generation to pass on your wisdom with adaptations along the way, of course. After all, people like to make things their own. + +## Project Engagement + +After working on a multitude of projects and sometimes multiple projects at the same time, each project requires something different. We have to change our approach; we have to actively listen to what our clients and colleagues need. If one client needs x, it does not mean the next client needs the same thing. 
If we delivered x to client number 2, perhaps they would be really unhappy because we did not meet their needs or expectations. This means we have the potential to change our approach, to listen to them, to create a rapport with them and create trust that if something went wrong, then it does not mean it was malicious or that it failed. Instead, perhaps we simply didn’t understand what was expected, perhaps it wasn’t right for us, or perhaps we didn’t yet find a suitable path to perform what was needed, but we still have the opportunity to return to that venture in the future. Even when things go wrong, we can change, the client can change, the individual can change and we can take this as an opportunity to enhance our collaboration and change how we work together. Ultimately, this message shows that we can learn and grow together by influencing and embracing different viewpoints. + +Change Management is an important concept within the IT industry, encouraging us to think about the impact change has on our projects, any issues that may arise and how we communicate through an organisation. This implies that we need to think about the psychology of how someone/team/company will receive the changes and how it will affect them on an individual/personal level. + +In the IT industry, it is important to be open to the possibility of change, as we understand that it is an ever-changing industry. Someone may feel that they have become an expert in a certain area, but then the next big thing comes along and suddenly a new set of learning needs to take place. + +It is important on projects to earn the trust of the team and the client, as it promotes collaboration and helps create a supportive network when making changes. This promotes good feedback and helps to deliver high quality products. + +## Software Testing Changes + +I have seen many changes in Software Testing. 
Manual Testing had huge importance 15 years ago, it fundamentally was framed within the Software Development Life Cycle (SDLC) and Software Testing Life Cycle (STLC) using theoretical models such as Waterfall and V Model. We learned that we had to write test conditions, test groups and test scripts whilst documenting all our work. Many testers would not even engage in automation testing at that time. 15 years later there is a predominant viewpoint that automation is crucial in being a successful software tester. There have been changes to our titles, e.g. Software Developer Engineer in Test (SDET) or Software Test Engineer whereby there is an emphasis on the importance of being engineers. This means there is a focus on the technical attitudes behind our work. However, we must not forget the art of traditional software testing. That is an opportunity for a different blog for another day! + +Software testing changes mean that we can add value to the software being created and use a context-based approach to promote quality within the work that we deliver. The feedback that testers give to the project is critical in ensuring that the products are meeting the requirements. + +## Artificial Intelligence - The Changing Landscape + +The world changes again, we are now entering into the evolving landscape of Artificial Intelligence (AI) and Machine Learning (ML). It has been so prominent in the news and industry in the last 6 months and we can definitely understand that companies are evolving their businesses to keep up with the ever-changing demands. We might choose to embrace this new world or we may enter into resistance because we are not ready to change to this new way of living. As testers, it is definitely worth understanding what AI and ML can do and how it has the potential to change how we work. Again, we can explore this together in a future blog. 
+ +## IT and Psychology Summary + +The IT industry is constantly evolving through technological changes, consumer needs and market demands. It is often difficult to keep up to date with all of the changes. Advancements such as AWS, Cloud Computing, and Artificial Intelligence mean that the IT landscape is changing and creating new opportunities for businesses to operate with a competitive edge in digital transformation. + +Change in the IT industry means that we need to scale our business models, offer greater flexibility and adapt to the ever-growing needs of society. Upskilling and re-skilling are particularly important, but only at the pace of the individual when they are ready to embrace the change. + +In the last 6 months, I’ve personally learned that change is all around us. Forever we will change, society will change, industry will change. We can actively participate in change, or we can resist it forever or until we personally feel we are ready. We can even be a thoughtful observer as we have the opportunity to observe what is happening around us without judgement. Entering into silence is an important concept when we implement “observation without judgement”, which ultimately helps us to enter into more carefully selected decisions of how we would like to move forward with our lives, our careers, our businesses, our industry and our personal/professional lives. + +Change can be difficult but finding our inner “Heartfluencer” (heart-centred leader who has potential in life) can help us embrace the changes that society moves us to, or even our inner self. We may be ready for those changes or we may not be ready. One day we might choose to embrace something different which ultimately forms our new paths forward, taking us on a different journey, a different adventure and providing us with different experiences. 
We may not like those paths, but at least we can stand proud and say, “I chose to explore and I now know what that path looks like for me.” Surely, this is better than having regrets, realising it is too late, or wondering “what if” during our reflective times. There will always be opportunities to explore, there will always be opportunities to change and make a difference, there will always be opportunities to say, “no this is not for me” or “I’m not ready for this yet” and there will always be an element of resistance somewhere in our lives - sometimes to the things that we need the most! I’d like to emphasise that being able to observe this and have compassion during changing times is so important for our wellbeing. It may also invite others to have compassion for their journey too, making us a true “Heartfluencer”. + +I hope this blog helps you take a different approach to exploring change. Enjoy exploring your paths with a new perspective! diff --git a/_posts/2024-04-17-is-agile-the-answer.md b/_posts/2024-04-17-is-agile-the-answer.md new file mode 100644 index 0000000000..11efada7db --- /dev/null +++ b/_posts/2024-04-17-is-agile-the-answer.md @@ -0,0 +1,60 @@ +--- +title: Is Agile always the answer? +date: 2024-04-17 08:00:00 Z +categories: +- People +tags: +- agile +- scrum +- team +summary: In software development we often make the assumption that Agile is the best + way to manage process on a project but just what do we mean by that, is it true + and, if so, why are there so many examples of it not working for teams? In this + post I try to take a deeper look at some of these questions. +author: dogle +image: "/uploads/is%20agile%20always%20the%20answer_.png" +--- + +## Just what is Agile anyway? + +I feel that I am often guilty of making an assumption that Agile is the best approach to software development in general. Recently I've been considering if it really is the best approach for a lot of teams, how it fails and why it fails. 
I've also been asking whether, when it does fail, that was a failing of the team to implement it correctly or whether it's more than that: perhaps Agile wasn't the right tool for the job in the first place. When it fails, which it does sometimes, the argument often seems to come down to "Agile doesn't work (at least not for everyone)" versus "you're just doing it wrong". + +The first thing to consider here seems to be how we define the term "Agile" with relation to software development. When trying to define Agile I am always brought back to the Agile Manifesto; after all, if this can't be held up as a definition of Agile then what can? There's surprisingly little in [here](https://agilemanifesto.org/); it makes no mention for instance of story points, stand up, sprints or many of the other ceremonies and ideas commonly associated with Agile Development. What it does have is some simple guidance around collaboration, working iteratively, and prioritising getting working software out the door over getting bogged down in "the plan". Sounds great, what could possibly go wrong? + +## A more realistic definition? + +The Agile landscape can be confusing. People often use Agile and Scrum interchangeably as though if you are doing Agile development you are, by definition, doing Scrum. A quick glance back at the Agile Manifesto shows us that this isn't really true and that there's a whole load of stuff that comes with Scrum that is simply not needed in order to be “Agile” by that definition. That said there is, on the other hand, a general acknowledgement I think that most teams are doing a variation of Scrum, Lean or Kanban styles of Agile. So if we are talking about what is and isn't Agile, then there is an argument to say it's more helpful to acknowledge this is what people tend to mean when they use the term, over sticking rigidly to the manifesto definition. 
+But that's not even the end of the story, it's also true to say no two Agile teams will be exactly the same and that each team will have its own take on Agile, often based on one of those three popular variations mentioned above. Indeed, one of the main philosophies of Agile is being adaptive to change and to just do what works over following a plan. Easy then to see how all this can be somewhat bewildering when trying to pin down what we mean by Agile Development or get started implementing it. + +## Where to start? + +This I think leads into where teams can start to struggle implementing Agile. If you are looking to start using Agile Development, it can be very hard to know where to begin. This can be especially true when you or your team has little experience with Agile to begin with. The Agile Manifesto isn't much help here of course, four short statements prioritising one thing over another isn't much of a guide to getting a project off the ground (nor is it meant to be). This is where Scrum can come in, Scrum has a handy [guide](https://scrumguides.org/scrum-guide.html), there are certified Scrum Masters to help out and it's widely used in industry. It's easy then to see why it's a popular starting point if you're looking to create a new Agile team without much prior experience. + +Scrum is fairly prescriptive, it builds on the Agile manifesto and goes further, laying out a set of roles and events to apply in day to day development. This might work well for you and your team but then again it might not. If you are not prepared to move away from what's written in the guide if it isn't working for your team, then there's a good chance it won't succeed for you long term. On the other hand, if you don't have some guide to follow but just some high level ideals to guide process, it can be hard to get started when you don't have much prior experience to lean on. + +## The problems with sticking to the plan + +What's important then when starting off with Agile? 
Why do projects struggle and why is there so much confusion and argument about how to get it right? +I think the hard truth here is that following any given strict plan almost certainly won't work well for your team long term, even if it works initially. The Scrum guide is just that, a guide, rather than a strict how-to. A healthy Agile team is going to take a little time to find its groove. There's no easy way of telling what process you should adopt upfront; rather, you will need to just pick a starting point, probably based on something like Scrum or what you have experienced in other teams but then allow the team to mould the process by trying new things and dropping things that are not working. Don't be tempted to ring fence particular things from being changed either. For example, maybe you strongly feel that stand-up is important but if the team votes to drop it or change the format you should at the very least be open to trying working without it for an iteration or two and see if things improve. The less the team has the ability to change things the harder it will be for them to find their groove and create a process which works well for them. It's also important to remember that as the team and the project changes, so too will the processes need to. So even if you have found a great way of working for the time being it will probably need to evolve over time as the project and team members change. + +It doesn't matter how you start off or even where your team is right now. If you're setting things in stone you're lining yourself up to fail. Agile teams regularly reflect on where they are and make changes in what they’re doing and how they work to match that. This aspect of Agile is far more important in my opinion than the ability to absorb changes to requirements at any stage in a project. The team needs to regularly be looking at the processes and ceremonies they have and asking what the value is for each one. 
Is it helping you deliver quality software, or is it just using up time? What would happen if you tried something new or even just stopped doing a thing? Changes in process can be a bit disruptive but if you drop, let's say, stand-up for a week, the project is unlikely to fall apart, and if it does you probably have bigger issues. If things get better you don't go back; if it's worse you just put it back and try something else. The important thing is that it's the core team who makes those calls, not managers, Product Owners or others outside the core team. The team needs to be empowered to make calls and actively take on ownership of how they deliver software. + +## The importance of self organising teams + +It stands to reason that Project Managers will be heavily involved in setting up new projects, teams and processes; that is after all their domain. What I think is important though is that once the team is up and running that process is then handed over to the team. I believe having self organising teams is one of the most important factors in a successful Agile team. It allows a team to be truly Agile in the sense that they can adjust and reshape their ways of working as the project evolves. It also embodies trust, i.e. when management trusts a team to be self organising, they get the most out of that team and the team develops a feeling of ownership and pride in their work and processes. It encourages the team to not only point out what isn’t working but to be proactive in imagining what improvements could be made and how things could be better. That's not to say there is no requirement for a Project Manager; rather, the core team's ways of working and ceremonies should be owned and managed by the core team. + +## Agile and teamwork + +If we take a look at the [Agile Manifesto](https://agilemanifesto.org/) again it is, like I said earlier, just some general advice on what to focus on when developing software. 
I believe that the Agile Manifesto and Agile development in general has strong ties to teamwork, good Agile teams are good teams in general and Agile is a key part of facilitating that teamwork. In my opinion great teams have the following properties: strong communication coupled with a clear hierarchical structure which utilises delegation to promote and facilitate independent, creative thought and initiative. Let’s break that down a little bit, if we look at the items on the left of the Agile Manifesto, two of the four ("Individuals and interactions" and "Customer collaboration") are around communication, along with some of the principles. A lot of the common ceremonies we associate with Agile are built around the idea of facilitating better communication both within the core team and with others, such as stand-up and sprint demo. + +It's important in any team that someone is in charge of the final decision on things, able to drive things forward, keep an eye on the bigger picture and resolve stalemates. This may be a different person for different issues in larger teams but that person needs to exist. Self organising teams is not about removal of that leadership but instead about delegation of responsibility for the day to day process of the team to the team itself. It encourages independent creative thought and a sense of shared ownership of process by the team. +When a developer picks up an item of work they are expected, to a greater or lesser extent, to own that piece of work. Giving a developer autonomy in how they implement the work and drive it forward to completion encourages a sense of ownership and pride in that task. As a result, a developer is more likely to bring their creativity and initiative to bear in implementation and to ensure it’s of high quality and tested. 
This, coupled with an environment where they are encouraged to lean on the rest of the team for help and use the team Lead to unblock problems where there is no clear path forward, enables people to do their best work. +In the same way, we should be providing the team as a whole with the autonomy to develop their own processes in an environment where they can lean on each other and look to a Project Manager to help guide and unblock process related issues where there is uncertainty. This ultimately will enable them to bring their collective creativity and pride to the development processes as well as individual tickets, producing the best results. + +## Why Agile can fail + +Why does Agile fail then? Probably for a multitude of reasons, but I suspect that, ultimately, one of the main reasons is a failure to adapt processes, i.e a failure of the team to change their ways of working appropriately to suit the current circumstances. This may manifest in various ways, such as teams sticking religiously to processes and ceremonies that are not adding value or worse, actively detrimental to progress and morale. It may be that teams are too cautious about trying new things and new approaches or that they are actively blocked from doing so through not having true ownership of those processes. They may be struggling to identify the things that are causing issues due to a lack of regular reflection or because people do not feel empowered to point out where the pain points are. Whatever the reasons, in the end the result will be the same, the development process will falter and team morale will suffer. If the ceremonies and processes that are commonly associated with Agile are seen to be part of the problem, it’s easy to see how Agile as a whole could be given the blame when things go wrong. 
Another potential pitfall can be attempting either to scale up one team's successful implementation of Agile to multiple teams or else to create a standardised Agile approach which is mandated to all teams. Either way, creating a uniform Agile approach, again, prevents the team from being able to modify their own processes. In this scenario, as before, Agile may be seen as the underlying issue when problems arise, when in fact it is this "prescribed Agile" approach which is preventing the team from adapting and responding to change. + +## Is Agile always the best approach? + +Is Agile the right fit for every project? No, probably not. However, if the question is "can Agile be applied successfully to most software development projects?" I think yes, in my opinion that still holds true, all things considered. The super important caveat here however is that I believe this is true **only** for a given definition of Agile. That definition being the high level Agile Manifesto definition we talked about earlier, i.e. being Agile in the sense of aligning the team's ways of working with the Agile Manifesto and principles. I think that the Agile Manifesto and the twelve Principles behind it remain good solid advice for software development and that the majority of projects can benefit from working in a collaborative, iterative manner, utilising self organising teams who regularly reflect on their ways of working and adjust to suit the current circumstances. I do not think that Kanban, Scrum or any other given frameworks will always be the best approach for most projects, nor do I think that any single rigid process will work well for a team indefinitely. Teams change shape, size and goals regularly and a successful Agile team needs to have both the will and the power to adapt with those changes. 
diff --git a/_posts/2024-04-18-beyond-the-hype-are-data-mesh-and-data-fabric-just-marchitecture.markdown b/_posts/2024-04-18-beyond-the-hype-are-data-mesh-and-data-fabric-just-marchitecture.markdown new file mode 100644 index 0000000000..706e9cb1bf --- /dev/null +++ b/_posts/2024-04-18-beyond-the-hype-are-data-mesh-and-data-fabric-just-marchitecture.markdown @@ -0,0 +1,40 @@ +--- +title: 'Beyond the Hype: Are Data Mesh and Data Fabric just Marchitecture?' +date: 2024-04-18 10:35:00 Z +categories: +- Podcast +- Data Engineering +tags: +- data +- data mesh +- data fabric +- data architecture +- data engineering +summary: In this episode, Oliver Cronk, Andrew Carr and David Hope talk about the + ever-changing world of data, with conversations moving from data warehouse to data + lake, and data mesh to data fabric. They discuss the importance of data ownership + and common tooling, and their view that data mesh is an approach rather than an + architecture. +author: ceberhardt +image: "/uploads/BeyondTheHype---green-and-orange---social---E17.jpg" +--- + + + +In this episode, Oliver Cronk, Andrew Carr and David Hope talk about the ever-changing world of data, with conversations moving from data warehouse to data lake, and data mesh to data fabric. They discuss the importance of data ownership and common tooling, and their view that data mesh is an approach rather than an architecture. 
+ +**Links from the podcast** + +* [Data Mesh](https://www.oreilly.com/library/view/data-mesh/9781492092384/) by Zhamak Dehghani + +* [Data Fabric as Modern Data Architecture](https://www.oreilly.com/library/view/data-fabric-as/9781098105952/) by Alice LaPlante + +* [Principles of Data Fabric](https://www.packtpub.com/product/principles-of-data-fabric/9781804615225) by Sonia Mezzetta + +**Subscribe to the podcast** + +* [Apple Podcasts](https://podcasts.apple.com/dk/podcast/beyond-the-hype/id1612265563) + +* [Google Podcasts](https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5saWJzeW4uY29tLzM5NTE1MC9yc3M?sa=X&ved=0CAMQ4aUDahcKEwjAxKuhz_v7AhUAAAAAHQAAAAAQAQ) + +* [Spotify](https://open.spotify.com/show/2BlwBJ7JoxYpxU4GBmuR4x) \ No newline at end of file diff --git a/_posts/2024-04-19-monitoring-codebuild-build-status.md b/_posts/2024-04-19-monitoring-codebuild-build-status.md new file mode 100644 index 0000000000..c8bf97da70 --- /dev/null +++ b/_posts/2024-04-19-monitoring-codebuild-build-status.md @@ -0,0 +1,123 @@ +--- +title: Monitoring AWS CodeBuild Build Status +date: 2024-04-19 00:00:00 Z +categories: +- Cloud +tags: +- Tech +- AWS +- Cloud +summary: Adventures into event-driven and serverless architecture… +author: bpritchard +image: "/uploads/Monitoring%20AWS.png" +--- + +## The Internal App Portal Project + +What do you do when you’re a software consultancy that uses small, internally developed applications, and you need to be able to spin those applications up on demand? You build something! + +Part of life at a consultancy means stints on the bench, allowing us to participate in internal projects. One of those projects is the ‘Internal Application Portal’ (IAP). A team of developers and testers, often rotating due to being pulled onto client projects, have researched, designed, and built a web application for internal use, which deploys containerised versions of our small applications on demand. 
+ +The IAP consists of a Typescript, React, and NextJS web application, deployed on AWS. The small apps, known as child apps, are containerised, and the images are pushed to ECR. From the main IAP web app (the admin app), the user clicks a button, which sends a request to AWS CodeBuild through the SDK, triggering a build. + +[CodeBuild](https://docs.aws.amazon.com/codebuild/latest/userguide/welcome.html) is AWS’s fully managed continuous integration service. Developers can configure build jobs, and then CodeBuild runs the build scripts, without needing to configure servers. In the case of the IAP, CodeBuild runs the child app Terraform scripts, which then deploys the infrastructure to host the child apps. The child app Terraform scripts are stored on S3, which are zipped and pushed when the IAP Terraform is deployed. + +![IAP Simplified Architecture]({{ site.github.url }}/bpritchard/assets/codebuild/arch-simplified.png "IAP Simplified Architecture") + +Among the requirements of the IAP project are that the child apps must: + + 1. Be ephemeral + 2. Be able to be destroyed + 3. Have unique URLS + 4. Only have one instance deployed for each user + +These requirements mean that the IAP must keep track of the state of the CodeBuild build jobs for the child apps. + +Initially, the build job status was retrieved by polling CodeBuild, using SDK. Polling has its limitations and felt like a clunky way of doing things, so we looked to AWS, which would surely have a better solution. + + +## EventBridge + +A vast majority of AWS services generate [events](https://docs.aws.amazon.com/eventbridge/latest/userguide/eb-service-event.html) when there is a change in the service or application. AWS EventBridge can have rules that listen for certain events; for example, a rule could listen for when an EC2 instance changes from ‘Pending’ to ‘Running’. In the case of IAP, a rule was created to listen for the status change in the CodeBuild build job. 
+ +The EventBridge rule was configured to send the event to an endpoint in our NextJS app, which stored the details in DynamoDB. The team decided to use DynamoDB as the source of truth for the CodeBuild build status so that if the NextJS app went down, the status of the child apps wouldn’t be lost. + +Talking of the app going down, we needed to think about redundancy, and what happens if the app goes down whilst a child app build is running. EventBridge can use SQS as a dead letter queue (DLQ); if the endpoint doesn’t return a 200 response, the event is sent to an SQS queue. After a glance at [SDK](https://docs.aws.amazon.com/AWSJavaScriptSDK/v3/latest/introduction/), it seemed we’d be able to get the events from the DLQ - happy days! (Side note, how many [TLAs](https://en.wikipedia.org/wiki/Three-letter_acronym) can I fit into this blog post?) + +![IAP EventBridge Architecture]({{ site.github.url }}/bpritchard/assets/codebuild/arch-eventbridge.png "IAP EventBridge Architecture") + +### The issue and the lesson +When implementing the DLQ logic, we discovered that the SQS `ReceiveMessageCommand` only gets up to 10 messages from the queue. If there are fewer than 10 messages, you may only get a sample of them; for example, if you have 6 messages in the queue, you may receive 3. To ensure you get all of the messages from the queue, you have to repeat the request… and reimplement the original polling, this time with extra AWS resources and steps. + +A small glimmer of hope appeared when we saw that there is a ‘first-in, first-out’ (FIFO) queue type available, which quickly died when we discovered that a FIFO queue could not be configured as the DLQ for an EventBridge rule. + +Although the chance of a user deploying a child app, and then the app going down before the response can be sent, was pretty small, especially for the internal once-in-a-while use case of the IAP, it was still causing concern and a reason to go back to the drawing board. 
+ +There’s a lesson here though - don’t have a glance at SDK and just assume it will work how you think it does! + +## Step Functions + +To truly understand where we were going with the event-driven architecture and come up with an alternative solution, we took a step back and had a look at what we required. These were: + + 1. The user can load the current state of the child apps + 2. The user is updated "instantly" when the child app state changes + +After picking the brains of some clever consultants in the company, the team started to look at AWS Step Functions. + +[Step Functions](https://docs.aws.amazon.com/step-functions/latest/dg/welcome.html) is an AWS service that provides serverless orchestration for other AWS services. A workflow, known as a state machine, runs a series of event-driven steps. These steps are generally tasks done by other AWS services through the AWS API, but Step Functions can also call out to external APIs. A state machine allows users to chain multiple lambdas and API calls together whilst preserving the inputs and outputs. The state machines are written in JSON format, but Step Functions has an easy-to-use UI to drag and drop steps. + +For example, you may have a state machine that is triggered when a file is added to an S3 bucket, which then runs a series of lambdas to manipulate the file in the bucket, and then sends a notification to a user; whilst you could do this flow without Step Functions, the UI makes the flow easy to visualise, the state machine preserves inputs and outputs, and just makes the whole process a bit nicer. + +![Simple State Machine]({{ site.github.url }}/bpritchard/assets/codebuild/sf-simple.png "Simple State Machine") + + +### Two steps forward… +A few investigations and proofs-of-concept later, the team had ideas for using Step Functions instead of the EventBridge/SQS architecture, as well as cost estimations that showed that the two designs had very similar (minimal) costs. 
+ +The initial idea was to use Step Functions for the full lifecycle of a child app. Instead of the NextJS admin app sending a request to CodeBuild to start a build, the admin app would trigger a Step Functions state machine. The state machine would trigger the CodeBuild job to deploy the child app, Step Functions would wait for the user to request to remove the child app, and then trigger the destroy job. + +Like the example S3 workflow above, we can already do this basic CodeBuild deploy/destroy without Step Functions. We thought Step Functions would give us a few nice things over the then-current solution: + + 1. Easier to add nightly shut-downs of the child apps + 2. Easier to add health checks for the child apps to ensure they are fully deployed before giving the URL to the user + 3. Able to use Step Functions as the source of truth for the app build status, rather than DynamoDB + +Our rough idea for the state machine was something like this: + +![PoC State Machine]({{ site.github.url }}/bpritchard/assets/codebuild/sf-poc.png "PoC State Machine") + + + +### … One step back + +Of course, nothing is that simple, and we discovered problems with our ideas. + +We first investigated how to handle user input with Step Functions - the answer here is callbacks and task tokens. If you set a lambda function in the state machine to ‘wait for callback’, the step will generate a task token in the lambda’s context object, and wait for that task token to be returned. The task token can be returned in a few ways, including webhook listeners, but the simplest way is using the API. + +~~~ bash +aws stepfunctions send-task-success --task-token YOUR_TASK_TOKEN +~~~ + +When the task token is returned, the state machine resumes - in our case, this would be used to wait for the user to trigger the destroy on the admin app, and then run the CodeBuild build job to destroy the infrastructure. 
+ +One of the main benefits we thought Step Functions would have is using Step Functions as the source of truth for the state of the child apps. This would mean we could eliminate a complex flow and reduce the amount of logic in our NextJS app. This was possible in theory, as we could use the AWS API to get the state from each step as needed, reducing our need to handle and store it. In practice, however, when we looked into using callbacks and task tokens, we realised we’d need to store the task token, which means we’d still need our DB. + +No doubt there is a better architecture for us to use Step Functions for the full child app lifecycle, but that would involve refactoring. Having weighed up the pros and cons, and keeping in mind that we were almost at MVP and ready for our first release, we decided to park the idea of having the full child app lifecycle on Step Functions and focus on a smaller area - a nightly shutdown of the child apps. + +The use case for the child apps, for MVP at least, is for interviews, so they will not need to run overnight. There’s an environmental and monetary cost to leaving apps deployed when they aren’t required, so being able to shut them down automatically is a good feature to add. + +The first design for our nightly shutdown used an `ECS:List Clusters` step to get the running containers and then shut them down using the CodeBuild build job. + +This brings us to the second big issue we found with using Step Functions for our app and infrastructure. For our CodeBuild build job to run, we need to give it a `host-name` variable, which is used for the ephemeral URL generated for the child app, as well as the names of the AWS resources deployed for that child app. The host names are generated from three words along with the name of the deployed child app, so they can end up being long. As AWS limits the number of characters in names, the host names are truncated when used for the AWS resource names. 
This posed a problem when we tried to get the host name from the outputs of the ECS step in the state machine - there was no easy way to get the full name needed for the variable, so we’d have to get it from the database. + +A redesign brought us to querying DynamoDB to get the host names. This solution has been implemented and has, so far, been successfully destroying any child apps left up at the end of the working day. + +![Nightly Shutdown State Machine]({{ site.github.url }}/bpritchard/assets/codebuild/sf-nightly-shutdown.png "Nightly Shutdown State Machine") + +## Conclusion +As we have reached MVP and done a release, the team is continuing the investigations into Step Functions and getting the full child-app lifecycle managed by a state machine. + +In my opinion, Step Functions is a cool AWS service that is worth spending time looking at. It can do the heavy lifting and make life easier; I’m sure that if the team knew about it earlier, and we had had the benefit of a team that wasn’t constantly changing and making consistent architectural decisions harder, we could have designed our app so that Step Functions and Lambdas do a lot more of the backend work, taking logic out of the NextJS app. + +The Internal App Project is an interesting bench project, and I’ve learned a lot! + diff --git a/_posts/2024-04-22-going-from-2.4-to-3.5.md b/_posts/2024-04-22-going-from-2.4-to-3.5.md new file mode 100644 index 0000000000..d55371d900 --- /dev/null +++ b/_posts/2024-04-22-going-from-2.4-to-3.5.md @@ -0,0 +1,363 @@ +--- +title: Apache Spark - What does going from 2.4 to 3.5 get you? +date: 2024-04-22 00:00:00 Z +categories: +- Data Engineering +tags: +- data engineering +- apache spark +summary: We look at what has changed between Apache Spark 2.4.x and 3.5.1, describing + some of the new functionality and the significant boost in performance. 
+author: sconway +contributors: mmorgan +image: "/uploads/Apache%20spark%20thumbnail.png" +layout: default_post +--- + +Apache Spark has now reached version 3.5.1, but what if you are still using a 2.4.x +version? 2.4.8 went out of support in May 2021, so upgrading is strongly advised. + +If you go through the pain of updating to the latest version, what do you gain? + +- Apache Spark SQL has gone through a major evolution, now supporting ANSI SQL, and + adding many new features and making many performance improvements. + +- A great deal of new functionality has been added in the Python and PySpark areas. In + particular, Pandas API on Spark gives you a tuned distributed version of pandas in the + Spark environment. + +- Streaming has gained a number of functional and performance enhancements. + +- The addition of support for NumPy and PyTorch aids machine learning tasks and the + distributed training of deep learning models. + +# Improvements to Apache Spark SQL + +Apache Spark’s SQL functionality is a major part of the platform, being one of the two +main ways of manipulating data. The 3.x releases have seen a major shift to support ANSI +SQL, with even Spark specific SQL being aligned as closely as possible with ANSI +standards. + +Performance has been enhanced with a new +[Adaptive Query Execution (AQE)](https://www.databricks.com/blog/2020/05/29/adaptive-query-execution-speeding-up-spark-sql-at-runtime.html) +framework. +Features include better execution plans with re-optimization based on runtime statistics +and optimization of query planning. There are better adaptive optimizations of shuffle +partitions, join strategies and skew joins. Dynamic partition pruning removes unused +partitions from joins, reducing the volume of data. Finally, Parquet complex types, such +as [lists, maps, and arrays](https://spark.apache.org/docs/latest/sql-ref-datatypes.html) +are now supported. 
+ +Various optimizations push +[filtering of data out to data sources](https://medium.com/@deepa.account/apache-spark-and-predicate-pushdown-f6a41d53bef5) +or earlier in the pipeline, reducing the amount of data scanned and processed in Apache +Spark. For instance, injecting Bloom filters and pushing them down the query plan, gave +a claimed ~10x speedup on a TPC-DS benchmark for untuned Delta Lake sources and Parquet +files. + +![Bloom filter pushdown 10x performance improvement]({{ site.github.url }}/sconway/assets/bloom-filter-pushdown-10x-perf.png "Bloom filter pushdown 10x performance improvement") +([source](https://www.databricks.com/blog/2022/06/15/introducing-apache-spark-3-3-for-databricks-runtime-11-0.html)) + +There are a large number of improvements to support developers and increase +functionality. SQL plans are presented in a simpler and structured format, aiding +interpretability. Error handling has been improved, with runtime errors returned instead +of NULLs, and with explicit +[error classes](https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/errors/package-summary.html) +indicating the type and location of the error. In addition, errors now contain industry standard [SQLSTATE error codes](https://spark.apache.org/docs/latest/sql-error-conditions-sqlstates.html). + +Developers are aided by better compile time checks on type casting. Parameterized SQL +queries make queries more reusable and more secure. New operations such as +[UNPIVOT](https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select-unpivot.html) +and [OFFSET](https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select-offset.html) +have been added along with new linear regression and statistical functions. In addition, FROM clauses can now use Table-valued Functions (TVF) and User Defined Functions (UDF), enhancing the capabilities of SQL syntax. 
Over 150 SQL functions have been added to the Scala, Python and R APIs, removing the need to specify them using error-prone string literals. + +# Enhancements to Python and PySpark functionality + +Python and PySpark have been a major focus of Apache Spark development. In V3.1, +[Project Zen](https://www.databricks.com/blog/2020/09/04/an-update-on-project-zen-improving-apache-spark-for-python-users.html) +began an ongoing process of making PySpark more usable, more pythonic, and more +interoperable with other libraries. The developer experience has been enhanced with +better type hints and autocompletion, improved error handling and error messages, +additional +[dependency management](https://www.databricks.com/blog/2020/12/22/how-to-manage-python-dependencies-in-pyspark.html) +options (with Conda, virtualenv, and Pex supported) and better documentation. + +Possibly the most significant change is with the +[Pandas API on Spark](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html), +added to Spark V3.2. This is a distributed re-implementation of the pandas data analysis +library, allowing the workload to be delegated across multiple nodes instead of being +restricted to a single machine. This provides improved performance, with execution speed +scaling nearly linearly with cluster size. Each new version has brought further +optimizations. + +As the 3.x versions have progressed, the Pandas API on Spark has implemented more and +more of the full pandas API (though if required a Pandas API on Spark Dataframe can be +converted to a pandas DataFrame, at the cost of being restricted to a single machine’s +processing power and memory). One difference from pandas is the use of +[plotly](https://plotly.com/python/) +for interactive data visualization, instead of static visualization with matplotlib. 
+ +New +[DataFrame equality test functions](https://www.databricks.com/blog/simplify-pyspark-testing-dataframe-equality-functions), +with detailed error messages indicating diffs, help with ensuring code quality. A +profiler for Python and pandas User Defined Functions helps with fixing performance and +memory issues. + +NumPy support brings powerful and efficient array functionality to Apache Spark, +something of particular use to ML users. + +# Streaming advances + +Streaming functionality has been enhanced, with stream-stream joins being filled out +with full outer and left semi joins and with new streaming table APIs. +[RocksDB is now used for state stores](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#rocksdb-state-store-implementation), +giving better scalability as size is no longer limited to the heap size of the +executors. In addition, fine-grained memory management enables a cap on total memory +usage across RocksDB instances in an executor process. + +Stateful operations ( +[aggregation](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#basic-operations---selection-projection-aggregation), +[deduplication](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#streaming-deduplication), +[stream-stream joins](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#stream-stream-joins) +) can now be used multiple times in the same query, including chained time window +aggregations. This removes the need to create multiple queries with intermediate +storage, reducing cost and increasing performance. In addition, stateful processing +functions can now be defined using Python, not just Java or Scala. + +Various enhancements give improved performance, including +[native Protobuf support](https://spark.apache.org/docs/latest/sql-data-sources-protobuf.html). 
+ +# Machine learning and AI gain support + +Apache Spark adding increased support for machine learning and AI is to be expected, +both because of the current buzz and because they are a natural fit for Spark’s +functionality. + +[TorchDistributor](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.torch.distributor.TorchDistributor.html) +provides native support in PySpark for PyTorch, which enables distributed training of +deep learning models on Spark clusters. It initiates the PyTorch processes and delegates +the management of distribution mechanisms to PyTorch, overseeing the coordination of +these processes. + +![TorchDistributor]({{ site.github.url }}/sconway/assets/torchdistributor.png "TorchDistributor") +([source](https://www.databricks.com/blog/2023/04/20/pytorch-databricks-introducing-spark-pytorch-distributor.html)) + +TorchDistributor is simple to use, with a few main parameters to consider: + +
+
+
+
+from pyspark.ml.torch.distributor import TorchDistributor
+model = TorchDistributor(
+	num_processes=2,
+	local_mode=True,
+	use_gpu=True,
+).run(<function_or_script>, <args>)
+
+
+
+
+ +Keeping up with the trend for LLM supported coding, the English SDK for Apache Spark +takes English instructions and compiles them into PySpark objects such as DataFrames. +Generative AI transforms the English instructions into PySpark code. + +For example: + +
+
+
+
+best_selling_df = df.ai.transform("What are the best-selling and the second best-selling products in every category?")
+
+
+
+
+ +# Spark Connect + +Spark Connect, introduced in v3.4.0, is a protocol designed to streamline communication +with Spark Drivers. + +The Spark Connect client library translates DataFrame operations into unresolved logical +query plans which are encoded using protocol buffers. These are sent to the server using +the gRPC framework. + +Here’s how Spark Connect works at a high level:
+([from the official documentation found here](https://spark.apache.org/spark-connect/)) + +- A connection is established between the Client and Spark Server +- The Client converts a DataFrame query to an unresolved logical plan that describes the + intent of the operation rather than how it should be executed +- The unresolved logical plan is encoded and sent to the Spark Server +- The Spark Server optimises and executes the query +- The Spark Server sends the results back to the Client + +![Spark Connect communication]({{ site.github.url }}/sconway/assets/spark-connect-communication.png "Spark Connect communication") +([source](https://spark.apache.org/docs/3.5.1/spark-connect-overview.html#content)) + +# Other areas of improvement + +The Apache Spark operations experience has been upgraded through new UIs, for instance +for Structured Streaming, more observable metrics, aggregate statistics for streaming +query jobs, detailed statistics about streaming queries, and more. + +We did not find any benchmarks directly comparing performance between Apache Spark 2.4.x +and 3.5.x. However, we did find +[this post by DataMonad](https://www.datamonad.com/post/2022-04-01-spark-hive-performance-1.4/) +running a +[TPC-DS benchmark](https://www.tpc.org/tpcds/), +which shows significant speedups going from Spark 2.4.8 to 3.2.1, particularly when +running concurrent queries. + +In a sequential test, Spark 3.2.1 performed about twice as fast as Spark 2.4.8. + +![Spark 2.4.8 vs 3.2.1 sequential benchmark]({{ site.github.url }}/sconway/assets/spark.2.3.vs.3.2.1.sequential.png "Spark 2.4.8 vs 3.2.1 sequential benchmark") +([source](https://www.datamonad.com/post/2022-04-01-spark-hive-performance-1.4/)) + +For a parallel test, Spark 3.2.1 ran up to 16 times faster than Spark 2.4.8. 
+ +![Spark 2.4.8 vs 3.2.1 concurrent benchmark]({{ site.github.url }}/sconway/assets/spark.2.3.vs.3.2.1.concurrent.png "Spark 2.4.8 vs 3.2.1 concurrent benchmark") +([source](https://www.datamonad.com/post/2022-04-01-spark-hive-performance-1.4/)) + +# Cloud Platform Support + +If you are running your own Apache Spark cluster, either on-premise or in a cloud hosted VM, then you can choose whatever version is available to you. However, if you want to run in a more Spark-as-a-Service mode, what do the major cloud providers offer you? + +Amazon claims that their +[EMR runtime](https://aws.amazon.com/emr/) +for Apache Spark is up to +[three times faster](https://aws.amazon.com/blogs/big-data/run-apache-spark-workloads-3-5-times-faster-with-amazon-emr-6-9/) +than clusters not using EMR. However, if you choose EMR, you get the single version of Apache Spark supported in that environment, which is currently 3.5.0. + +[Microsoft Azure HDInsight](https://azure.microsoft.com/en-gb/products/hdinsight) +is more trailing edge than leading edge. HDInsight V4.0 provides Apache Spark V2.4, whose basic support ended in February of 2024. HDInsight V5.0 supports Apache Spark V3.1.3, which was released in February of 2022. Meanwhile, HDInsight V5.1 supports the more recent Apache Spark V3.3.1, released in October of 2022. + +Google offers +[GCP Dataproc](https://cloud.google.com/dataproc), +with serverless Spark support. The oldest supported Apache Spark runtime is V3.3.2, +with the default being V3.3.4. The latest V3.5.1 is also offered. + +An alternative, though possibly more expensive route, is to use +[Databricks](https://www.databricks.com/)’ +offerings on the main cloud providers. These give you a choice of the most recent +versions of Apache Spark. + +# Deprecations + +Here are the library versions and runtime changes that have occurred since Spark v2.4.6. + + + + + + + + + + + + + + + + + + + + +
3.0.0 +
    +
  • Deprecate Python 2 support
  • +
  • Deprecate R < 3.4 support
  • +
  • Deprecate UserDefinedAggregateFunction
  • +
+
3.1.1 +
    +
  • Drop Python 2.7, 3.4 and 3.5
  • +
  • Drop R < 3.5 support
  • +
  • Remove hive-1.2 distribution
  • +
  • Remove references to org.spark-project.hive
  • +
  • Deprecate support of multiple workers on the same host in Standalone
  • +
+
3.2.0 +
    +
  • Deprecate spark.launcher.childConectionTimeout
  • +
  • Deprecate GROUP BY ... GROUPING SETS (...) and promote GROUP BY GROUPING SETS (...)
  • +
  • Deprecate Python 3.6 in Spark documentation
  • +
  • Deprecate ps.broadcast API
  • +
  • Deprecate the num_files argument
  • +
  • Deprecate DataFrame.to_spark_io
  • +
+
3.5.0 + Upcoming Removals
+ The following features will be removed in the next Spark major release +
    +
  • Support for Java 8 and Java 11, and the minimal supported Java version will be Java 17
  • +
  • Support for Scala 2.12, and the minimal supported Scala version will be 2.13
  • +
+
+ +# Resolved Issues + +Here are the resolved issues that have been implemented for each minor release since Spark v2.4.6. + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
3.0.03,400 tickets resolved
3.1.11,500 tickets resolved
3.2.01,700 tickets resolved
3.3.01,600 tickets resolved
3.4.02,600 tickets resolved
3.5.01,300 tickets resolved
+ +# To Conclude + +Migrating software to newer versions is always daunting, but moving on from out of +support versions is a crucial process. + +As seen above, there are relatively few deprecations. A +[migration guide](https://spark.apache.org/docs/latest/sql-migration-guide.html) +is provided for Apache Spark and includes a number of configuration switches to retain legacy behaviours, easing the migration process. + +In return for the pain, you gain access to a great deal of new functionality, in all +areas, and significant performance improvements. + +# More Information + +The latest Apache Spark documentation can be found +[here](https://spark.apache.org/docs/latest/).
+The documentation for the last 2.x version (2.4.8) is [here](https://spark.apache.org/docs/2.4.8/).
+Release notes can be accessed from the +[Apache Spark News index](https://spark.apache.org/news/index.html).
+The Apache Spark migration guide is provided +[here](https://spark.apache.org/docs/latest/sql-migration-guide.html). + +Databricks has useful summaries of each Apache Spark 3.x version.
+[Apache Spark 3.0](https://www.databricks.com/blog/2020/06/18/introducing-apache-spark-3-0-now-available-in-databricks-runtime-7-0.html)
+[Apache Spark 3.1](https://www.databricks.com/blog/2021/03/02/introducing-apache-spark-3-1.html)
+[Apache Spark 3.2](https://www.databricks.com/blog/2021/10/19/introducing-apache-spark-3-2.html)
+[Apache Spark 3.3](https://www.databricks.com/blog/2022/06/15/introducing-apache-spark-3-3-for-databricks-runtime-11-0.html)
+[Apache Spark 3.4](https://www.databricks.com/blog/2023/04/14/introducing-apache-sparktm-34-databricks-runtime-130.html)
+[Apache Spark 3.5](https://www.databricks.com/blog/introducing-apache-sparktm-35) diff --git a/_posts/2024-04-22-message_types.md b/_posts/2024-04-22-message_types.md new file mode 100644 index 0000000000..16b028a65d --- /dev/null +++ b/_posts/2024-04-22-message_types.md @@ -0,0 +1,290 @@ +--- +title: Async APIs - don't confuse your events, commands and state +date: 2024-04-22 09:00:00 Z +categories: +- Data Engineering +tags: +- Data Strategy +- Queue +- Event +- State Transfer +- Async command +summary: This blog is about the different types of message you can put on systems + like Rabbit MQ and Kafka. It discusses the differences between commands, events, + state and gives a few tips around how to structure your messages. +author: dhope +image: "/uploads/Async%20APIs.png" +--- + +Image of some services,a message broker and messages going between them and two arrows. The arrow pointing at the message broker saying 'previous blog' and the arrow pointing at the mesages saying 'this blog' + +In my previous [blog post](https://blog.scottlogic.com/2023/11/14/data-distribution.html) I looked at various technologies for sending data asynchronously between services including RabbitMQ, Kafka, AWS EventBridge. This time round I'll look at the messages themselves which over the last few years I've found to be a more complex and nuanced topic than expected. + +To set the scene see the diagram below of an imaginary financial trading application: + +Diagram showing five services: trade execution, order, data warehouse, payments, prices and various messages going between them like the command Take Payment, state update Trade Details, event Order placed and time series data Stock Price at T=1,2,3 + +There's lots of data flying around varying from real time pricing data to instructions to execute trades. I've coloured the data entities according to their types and we see there's a few different patterns like events and state which we'll discuss in a moment. 
+ + The data bus isn't shown in the diagram because the discussion in this blog is relatively independent of which you pick. You might imagine, for example, that those with "event" in the title like Azure Event Grid and AWS EventBridge are only for events but the reality is that most data buses support payloads of 256kB or more meaning you can be flexible in what you send in any technology. + + +## Terminology and types of data +To begin I think it's useful to classify the different types of messages we might send before going into more detail on each in the following sections. The obvious ones are: + + + + + + + + + + + + + + + + + + + + + + +
TypeExample
Events
    +
  • User Created (ID=55)
  • +
  • ECS instance started (ID=353)
  • +
Commands/Instructions/Request
    +
  • Send a reset password email to user X
  • +
  • Take payment for £x from user Y
  • +
State
    +
  • User (complete object)
  • +
  • Product (complete object)
  • +
  • Order (complete object)
  • +
Time series values
    +
  • Stock price
  • +
  • Error rate metric on API
  • +
+ +Time series data is of course the state of a single thing, but I'd argue that it is distinct because of its periodic nature - an update is sent regardless of whether it has changed or not. + +In the following sections we'll dive into some of these in more detail but let's discuss one more piece of terminology: *messages*. + +Documentation, blogs and books sometimes talk about messages when referring to the use of queues/data buses. I'll use the term *message* in this blog as a generic term for asynchronous communication that may be state or commands or events. + +## Events vs State +An event says that something has happened or changed, e.g. "Account Created". +Ignoring timestamps and metadata it might have a payload like: + +
+
+{
+  "EventType": "ACCOUNT_CREATED",
+  "AccountID": "8c0fd83f-ff3f-4e0e-af4b-2b7470334efa"
+}
+
+
+ +If you want to know about the details of the particular account then you need to get it by some other route like an HTTP request to an accounts REST API or whatever interface is in place. + +*State* on the other hand contains the full state for whatever entity has been created or changed. e.g. + +
+
+{
+  "EntityType": "ACCOUNT",
+  "ID": "8c0fd83f-ff3f-4e0e-af4b-2b7470334efa",
+  "Name": "David",
+  "Email": "An email@domain.com",
+  "Tel": {
+    "Type": "Mobile",
+    "Country Code": "44",
+    "Number": "777777777"
+  },
+  .....etc
+}
+
+
+ +In this example I haven't included a field to say if it is created or updated. A downstream doesn't necessarily care if they saw an earlier message, they'll just check if they already have the particular entity or not. We are sending state, not what happened like creation or an update. The exception is deletions that need some special treatment, e.g. a special message type or an empty payload to signify the state is gone. + +State messages can be used in lots of scenarios but are a necessity when going towards an event sourcing route with the event log as the source of truth rather than a database. + +I've found in practice that the difference between state and events can be a bit greyer than suggested so far. You may encounter halfway solutions where an event has some commonly used info, like the email, but not the more detailed information. It's not very pure but it saves a lot of API requests for consumers who only care about the email. +In a similar vein, sometimes an event is conveying a change in just one field, e.g. a "Phone Number Changed" event and includes the phone number as well as the user ID and so carries all the state. Sometimes a state message may include before and after state or a change list with the field names/paths (e.g. changes=[firstname, person.phone.mobile]) that have changed. + +The following table summarises the differences: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
FeatureEventState
PayloadAn ID and enum to say what happenedA full data entity
Event typeEnum identifying a specific thing that happened like EMAIL_UPDATEDEntity name like PROFILE
Followup callsYes, API call neededNo
UpdatesIdentified by event name but no way to see old valueTypically not conveyed if a state is new or update but sometimes a message will have old and new state
Message sizeSmallMedium or Large
+ +### Tradeoffs +I wouldn't say there's a right or wrong option as to which to go with. It will depend on a few things: + +#### Number of consumers +A sketch showing a producer service sending out a message that fans to many consumers who all then do a REST call back into the producer to get more data +As the consumer volume goes up the advantage of the stateful approach is that you don't end up with heavy load on an API. Imagine a cluster of 100 messages going onto a bus arriving at 15 consumers at the same time. You've then got 1500 requests in a second or two to your API. + +#### Resilience +Relating to the number of consumers, if your API is not that reliable then the stateful option can be better for resilience because you don't have a dependency on both the message bus and the API, just on the message bus in order to get all the data. + +#### Coupling +Resilience and some of the other points are really a form of coupling. If a service must call another service's API to get data it is more closely coupled to that service than a state message solution where the consumer needs to know nothing about the producer and isn't dependent on its name, resilience, API schema etc. + +#### Data Transfer Volumes +If most consumers only want 2 or 3 fields but the state messages have 200 fields in them it can be wasteful. In this case an event option will be more efficient assuming the synchronous APIs (e.g. REST, GraphQL) are finer grained. It's not a major plus for small focused state objects (e.g. 10-20 fields) but more important if sending large chunks of data around going into the 10s of KBs. + +#### Consumer simplicity + +Sometimes I've heard people assert that a state message is simpler because there's no need to make a call to an API which is true. But... it isn't always so straightforward. This is best explained with an example. 
Consider the following scenario: + + * the data of interest is a user's account detail + * you want to send an email or SMS to the user for security reasons when properties on the account change such as the email address change + * you've taken the state message approach + * you don't include a change list, just the current state. + + Any service that is responsible for sending the email or SMS has to have its own state so it can compare before and after values and see that the email changed rather than some other field like name. + +On the other hand, if you had a single event saying "email changed" (with the new email on the event or available via API) then the processing service can be stateless. + +In this case the consumer is actually much simpler with events but the problem for state messages can be negated by including a change list. + +#### Schema management +You have to keep 2 schemas in sync for the stateful approach (REST and message) and a lot of messaging systems don't have good support for schema management compared to API framework s. +This is true both in terms of providing a schema for developers, e.g. via Swagger/Open API but also in terms of enforcing what producers are allowed to write onto a queue. + +#### Aggregations +This is addressed in a lot more detail in the upcoming Part 2 of this blog. +For now we'll note that if a service needs a few entities to do its work (that would typically arrive via multiple state messages) then an event model can be simpler. The consumer receives 1 event and then immediately makes a few REST calls or a single Graph QL call to get the required entities to proceed. + +With the state approach you potentially have to handle out of order messages and wait for all messages before you can proceed. Alternatively, a larger aggregation state message with all the entities must be constructed which has its own problems. + +### Instructions/commands/requests +An instruction or command is a request to "do X". 
As an example, think about a commerce site or government service that delivers a product by post or courier once a payment is made. Asynchronously this may work 2 ways: + + 1. there's a delivery microservice that is listening to generic ORDER_PLACED events (or order state) and acts upon them to arrange delivery. + 2. The order application (or an intermediary microservice service that consumes the ORDER_PLACED events) writes out a "PREPARE_DELIVERY" instruction or similar to a delivery company service. + +The latter is an example of an instruction. + +The instruction message will typically contain all the necessary information for the downstream to do its work although it doesn't have to. Generally, because an instruction is quite targeted, there is no reason to not have the relevant data in the message unless there are any large files or images needed that are best not transmitted on the message bus. + +### Commands vs state/events + +Now we've looked at instructions let's compare them to state and event messages. I would say the differences are: + + * A state or event message is quite generic and there could be a few services interested in it. + * A command is more specific and targeted at a particular consumer albeit with loose coupling (via queue or similar). + * With a command there is often an expectation of a response back via another message to confirm that it has been received and accepted or acted on. + +My personal take on this is that commands best fit into a workflow where you want to keep coupling low but nonetheless you are requesting something to happen, and you care that it does happen. You may want to be able to bring up on a dashboard the state of the user's order and its delivery, and take action where there are problems. You don't want to have to pull data from numerous systems to get that view. Such a scenario often benefits from an orchestrator, e.g. something like Camunda or Uber Cadence or AWS Step Functions. 
+ +With events/state messages then the source system (or an orchestrator) doesn't take any responsibility for what happens when it has done its work. It just throws out a message saying "here's some new/updated data" and moves on. It's up to other services to decide what to do and to provide a view on the status of the downstream actions. An obvious corollary of this is that where transmitting state, if any critical (to business function) downstreams depend on it then the messaging system must be very robust because there's no opportunity for retries or flagging errors in the source system. The source has no idea if downstreams got the data and successfully processed it. + +## Time series data +I am not going to say a lot here because the question of what to put in the message is a lot more obvious: + +* 1 or more values, +* the value type(s) +* a timestamp. + +The challenges are primarily around the message bus and the consumer. e.g. working out when all data has arrived in a given time period (See watermarks in [Streaming Systems](https://www.oreilly.com/radar/the-world-beyond-batch-streaming-102/)) and finding the right balance between risk of data loss vs throughput and latency. But the question of what to put in the message itself is comparatively simple. + +## Message envelopes + +simple sketch of inner and outer boxes with the outer labelled as the message envelope and the inner as the message + +As well as the contents of the payload some thought should be given to the message metadata that will make up the standard envelope for all your messages. +A few recommendations are: + +### IDs + Include a unique ID on a message regardless of whether it's state, command etc. I'd advise UUIDs to guarantee uniqueness. This ID should just be about the message and not the entity. This is useful because: + + * for a command an action may not be idempotent, e.g. 
sending an email is not idempotent and so you must be able to de-duplicate + * even for state which ideally is idempotent, it's better to avoid duplicating work in consumers and so having an ID to check against makes this easy. + + +### Timestamps +Include a timestamp in a standard UTC form so that a consumer can reorder messages and be clear about what the timestamp means. I'd recommend this being based on the entity being written to the source database (where applicable) or processed, not the message send time which in a threaded system may be non-deterministic. + +On the format, it's debatable but I prefer string versions of the timestamp as it makes debugging easier without having to convert epoch values. e.g. 2024-02-19T12:18:07.000Z, not 1708344420000. + +### Versioning +Have a plan for versioning which could be in version field or be part of the message type name if you want to be able to route different versions to different consumers easily. + +Don't confuse the version for the message envelope (shared across many entities) and the version for the specific entity. It's fine to have 2 version numbers, one for each. + +### Testing and environments +It's worth allowing for testing and multiple environments in your messages. +For example, consider a flag to say if a message is a test message. This will allow easy filtering of test data in production without polluting your analytics systems. + +Also consider an environment flag. It is common to flow production data into test environments to help provide realistic data. Sometimes you'll want to know about this because, as the data came from production, referenced IDs won't exist. A flag lets you know this came from another environment and not all linked data may have flowed into that test environment. + +### Example +As an example of a message with the above fields: + +
+
+ {
+  "messageID": "cc7b9901-c339-4c7d-80cd-c400f20581fd",
+  "timestamp": "2024-02-19T12:18:07.000Z",
+  "entityType": "ACCOUNT",
+  "envelopeVersion": 1,
+  "isTest": true,
+  "fromEnvironment": "prod",
+  "payload": {
+    "version": 1,
+    "accountID": "0a0ebe8d-e48a-4195-8372-4f54c5dfd4e5"
+  }
+ }
+
+
+ +## Final thoughts + +We've been through some of the pros and cons of events vs state and also looked at commands, observing the latter are often used in a workflow where you care about the receipt of that instruction and want to know the state of the action off the back of it. + +On state and events specifically, I'm not sure there's ever a 100% preferred approach just tradeoffs dependent on the number of consumers, the relationships between your data entities. If I must leave the fence, all I'll say is that state messages have often proved more complex than expected so I lean very slightly towards events, all else being equal. A few reasons being: + +* only one API for getting the data - don't need to keep 2 in sync +* consumers don't have to assemble objects turning up in random order +* one source of truth accessible via the one API +* no need to worry about replays and backfills - just grab historic data from the REST/GraphQL/RPC API. + + Nonetheless events do mean tighter coupling between services and won't always scale if consumer numbers are high. + +Whatever you go for, have a clear plan, try to be consistent and logical and don't make a choice accidentally. Put another way don't randomly mix instructions, state and events within a service without any clear reasoning. This doesn't mean you should try and have a one size fits all enterprise-wide pattern. Even in a single domain it may be fine to have one service emitting state and another service listening to that and sending commands to do specific things when the data changes. + +In part 2 I'll go into more detail on state messages looking at how to pick the right granularity for the data. 
diff --git a/_posts/2024-04-23-are-we-ready-to-put-ai-in-the-hands-of-business-users.md b/_posts/2024-04-23-are-we-ready-to-put-ai-in-the-hands-of-business-users.md new file mode 100644 index 0000000000..c37a461759 --- /dev/null +++ b/_posts/2024-04-23-are-we-ready-to-put-ai-in-the-hands-of-business-users.md @@ -0,0 +1,77 @@ +--- +title: Are we ready to put AI in the hands of business users? +date: 2024-04-23 00:00:00 Z +categories: +- Data Engineering +tags: +- data +- bi +- business intelligence +- data visualisation +- ai +- artificial intelligence +- ml +- machine learning +- generative ai +- generative bi +- aws +- cloud +- quicksight +- sagemaker +- sagemaker canvas +summary: Lots of businesses want to use AI, if they can find the right business case + for it. We look at some new and enhanced AWS products which take a low-or-no-code + approach to using AI to enhance Business Intelligence tools. +author: csalt +contributors: sgladstone +image: "/uploads/are%20we%20ready%20to%20put%20ai%20in%20the%20hands%20of%20business%20users.png" +layout: default_post +--- + +Generative AI has been grabbing headlines, but many businesses are starting to feel left-behind. Large-model AI is becoming more and more influential in the market, and with the well-known tech giants starting to introduce easy-access AI stacks, a lot of businesses are left feeling that although there may be a use for AI in their business, they're unable to see what use cases it might help them with. They see the potential of AI, but struggle to find practical applications for it. Consumer chatbots are now a common feature of company home pages, but aren't always popular with end users. With little other use of AI in the business, some companies are starting to feel that they're missing out from the benefits of AI whilst at the same time being not quite sure what they're missing out on. 
+ +However, at the same time, it is becoming more and more common to find AI-driven solutions to specific, well-contextualised business problems. + +Several of the tech giants are quietly developing powerful and user-friendly tools to do just that; we recently attended a workshop run by Amazon, to show off some of their products in this area and get hands-on experience of some of AWS's preview capabilities. These are undoubtedly very powerful, and moreover position AI firmly as a tool to help business, a tool that delivers real value for its (admittedly expensive) running costs. Amazon's goal in this space is to develop AI tools that are able to be used by non-technical users, so that organisations are not hamstrung by any bottlenecks caused by the number of data scientists they have available. We're not sure they have completely achieved it, but they have certainly created something that a non-specialist can pick up and potentially get results with very quickly. + +## Model training with SageMaker Canvas + +One of the first steps in using AI to analyse your own data is to generate a model based upon it, and in AWS, this is done using Amazon SageMaker. In particular, with SageMaker Canvas, it's possible to create a machine learning model entirely graphically. You can directly upload a data set, or it can come through some sort of ingestion pipeline using an ETL tool such as Amazon Glue. You can select which algorithm(s) to use to train the model; but then all you need to do is wait. Once the model has been trained, you can study its accuracy and its predictions in the SageMaker UI. 
+ +![Screenshot of AWS SageMaker Canvas, with the accuracy of a prediction shown using a Sankey diagram]({{ site.github.url }}/csalt/assets/aws-ai-and-bi/aws-sagemaker-canvas-1.png "Screenshot of AWS SageMaker Canvas, with the accuracy of a prediction shown using a Sankey diagram") + +The interface uses a Sankey diagram to display the false positives and false negatives of each predicted outcome in an intuitive and easy-to-grasp manner. + +SageMaker Canvas also has a number of other features, including the ability to take one of the well-known large language models already produced and use it as a "foundation model", customising the model using your own data or documents. This is particularly useful when wanting to create a chatbot using retrieval-augmented generation (RAG), and SageMaker Canvas lets you configure multiple models to return replies from the same document corpus, then start a simultaneous chat session with each model in parallel, sending the same questions to each. Just a few minutes with this gives you a really fascinating insight into the comparative performance of different LLMs. + +![Screenshot of AWS SageMaker Canvas showing a parallel chat conversation posing the same questions to 3 different LLMs]({{ site.github.url }}/csalt/assets/aws-ai-and-bi/aws-sagemaker-canvas-2.png "Screenshot of AWS SageMaker Canvas showing a parallel chat conversation posing the same questions to 3 different LLMs") + +One key business benefit from the SageMaker Canvas architecture is that the data you upload and the models you create will stay entirely under the control of your own AWS account, and are not shared with the foundational model's provider in any way. This completely obviates a lot of the concerns that I know many businesses have had around knowing what will happen to your data, not only regarding business sensitivity and commercial secrecy, but also with your legal responsibilities around GDPR and similar legislation elsewhere. 
Amazon Glue includes built-in transformations which can attempt to redact any personally-identifying information from your data sets, to further minimise any risk; and this redaction can use a consistent hashing algorithm to enable data correlation without personal identification. + +Moreover, with cost always being a worry when it comes to AI model training, removing unnecessary fields from your data not only helps remove biases from your model, it improves your training speed and costs too. One slightly clunky aspect of SageMaker Canvas, if you want to use the Foundation Model feature, is that you have to be granted access to the model(s) in the underlying Amazon Bedrock configuration. This might seem an administrative nuisance, but does enable companies to place sensible restrictions on what users are able to do. + +## Predictive BI insights with Amazon QuickSight + +Amazon QuickSight is AWS's offering in the business intelligence dashboard space. It's been around since 2017, and we don't intend to go into a full review of its features here—only a month ago, Mike Morgan and Steve Conway from our Leeds office published [a comparative review of three cloud BI solutions, including QuickSight]({{ site.github.url }}/2024/03/26/cloud-business-intelligence-a-comparative-analysis.html) here on the Scott Logic blog. However, Amazon are adding the ability to link your QuickSight dashboards to your SageMaker Canvas models, combining the data set your model was trained on with the future data it predicts. + +This is powerful as it allows you to create dashboards that go beyond reporting a reflection of the current state of your data. You can build side-by-side visualisations for the state of your business and ML predictions for future trends. 
+ +![An AWS QuickSight dashboard showing financial loan data, including predicted loan outcomes]({{ site.github.url }}/csalt/assets/aws-ai-and-bi/aws-quicksight-1.png "An AWS QuickSight dashboard showing financial loan data, including predicted loan outcomes") + +## Generative BI? Natural language processing in Amazon QuickSight + +At the workshop, we were able to play with a new feature being added to Amazon QuickSight, which Amazon are calling "Generative BI". When we heard the term, we were a little bit puzzled and almost put off because, after all, your BI data has to be based on solid facts. However, what it means is: Amazon have built their Amazon Q chatbot into QuickSight, so you can use natural language queries to explore your data and visualise it. We were both really impressed with how quickly we could use this to create dashboards. For example, when asked to “forecast loan amount by month”, Q will build you a visual that you can add to your dashboard: + +![Adding different visualisations to a QuickSight dashboard with natural language queries]({{ site.github.url }}/csalt/assets/aws-ai-and-bi/aws-quicksight-2.png "Adding different visualisations to a QuickSight dashboard with natural language queries") + +You can see from those screenshots that it doesn't always get your intent 100% of the time, and its intent recognition can partially depend on you configuring your data set properly: manually adding synonyms for field names, for example. You can also see, though, that it really quickly gives you a good starting point, a set of dashboard visualisations that you can then tweak and finesse—and you can edit visuals using natural language, as well as creating them. There are a few little oddities in what intents it understands—for example, when we were playing with it, it could understand "change this chart to a donut chart" but couldn't understand "change the gross profit segment of the chart to green", but overall, this is a really nifty tool. 
Right now it's still a preview product, so the UX can be a little clunky and glitchy in parts with slide-out panels occasionally obscuring other controls; we're sure that a lot of these quirks will be resolved before long and everyone will be able to see what a powerful addition to QuickSight this is. + +## Is this AI for the regular business user? + +The goal of these products and features, we were told, was to enable the use of AI, to enhance BI, by business analysts who are domain experts but are not data scientists or software engineers. Have Amazon succeeded? In one sense, we're not the best people to ask about that, because we are software engineers ourselves; we're not the target market. Zero-code, graphically-edited data preparation tools and BI tools are hardly new to the marketplace, either. The real innovation here is building AI seamlessly into the tooling, and giving non-technical users the ability to do their own AI model training. + +However, it's never quite as straightforward as simply giving everyone access to these new features and expecting a sudden revolution in your business. Responsible use is key. Having a plan is key. Unleashing large numbers of employees to train countless models without understanding, say, the difference between the various training algorithms they could choose from, can quickly lead to huge costs, wasted resources, lots of environmental emissions, all for no business benefit. While these tools lower the barrier to entry, technical understanding and technical guidance will still be the key to success. + +We're not convinced these tools will remove the need for data scientists and software engineers in the business, any more than previous generations of zero-code data tools have. 
What they can do, though, is enable the engineers and specialists to focus on the in-depth engineering, on surfacing data sources and making deeply-buried silos accessible, and engineering systems to unearth that data and move it at scale into a data lake that models can then be trained from. The business team will then be able to use their domain knowledge in combination with AI-enhanced BI tooling to quickly and easily visualise the data and the forecasts that the business needs. Businesses may well want to bring in specialist data engineers at the outset, to work on these data flows and build themselves a platform which will enable their BI platform to grow and evolve. Their BI dashboard can then be developed over time by in-house experts on the data domain, experts who know their data models inside out. + +The products Amazon have been demonstrating are soon going to be matched by other suppliers in the wider marketplace; we suspect a host of other AI data summarisation and analysis tools will appear in the marketplace, and they are likely to be one of the biggest effects that AI has on business in the coming years, breaking down technical silos and realising business value. This is a really exciting field to be working in right now, and as Scott Logic consultants, we're really looking forward to helping build those data pipelines and data flows that businesses are going to need to unlock the maximum business value from these systems. 
diff --git a/_uploads/Apache spark thumbnail.png b/_uploads/Apache spark thumbnail.png new file mode 100644 index 0000000000..be1c5b8c0b Binary files /dev/null and b/_uploads/Apache spark thumbnail.png differ diff --git a/_uploads/Async APIs.png b/_uploads/Async APIs.png new file mode 100644 index 0000000000..b12a514335 Binary files /dev/null and b/_uploads/Async APIs.png differ diff --git a/_uploads/BeyondTheHype---green-and-orange---social---E17.jpg b/_uploads/BeyondTheHype---green-and-orange---social---E17.jpg new file mode 100644 index 0000000000..0d2870bcf2 Binary files /dev/null and b/_uploads/BeyondTheHype---green-and-orange---social---E17.jpg differ diff --git a/_uploads/CXOs greenwashing.png b/_uploads/CXOs greenwashing.png new file mode 100644 index 0000000000..c95ca68d20 Binary files /dev/null and b/_uploads/CXOs greenwashing.png differ diff --git a/_uploads/How can CxOs-752a43.png b/_uploads/How can CxOs-752a43.png new file mode 100644 index 0000000000..8a82d6b966 Binary files /dev/null and b/_uploads/How can CxOs-752a43.png differ diff --git a/_uploads/How can CxOs.png b/_uploads/How can CxOs.png new file mode 100644 index 0000000000..8a82d6b966 Binary files /dev/null and b/_uploads/How can CxOs.png differ diff --git a/_uploads/Monitoring AWS thumbnail_.png b/_uploads/Monitoring AWS thumbnail_.png new file mode 100644 index 0000000000..2c8c5b1e0d Binary files /dev/null and b/_uploads/Monitoring AWS thumbnail_.png differ diff --git a/_uploads/Monitoring AWS.png b/_uploads/Monitoring AWS.png new file mode 100644 index 0000000000..e0e9c500d2 Binary files /dev/null and b/_uploads/Monitoring AWS.png differ diff --git a/_uploads/The changing world thumbnail_.png b/_uploads/The changing world thumbnail_.png new file mode 100644 index 0000000000..7d279110ab Binary files /dev/null and b/_uploads/The changing world thumbnail_.png differ diff --git a/_uploads/Will it automate_ thumbnail.png b/_uploads/Will it automate_ thumbnail.png new file mode 100644 index 
0000000000..9953bba9a5 Binary files /dev/null and b/_uploads/Will it automate_ thumbnail.png differ diff --git a/_uploads/are we ready to put ai in the hands of business users.png b/_uploads/are we ready to put ai in the hands of business users.png new file mode 100644 index 0000000000..48b4e52a35 Binary files /dev/null and b/_uploads/are we ready to put ai in the hands of business users.png differ diff --git a/_uploads/carbonhack24 thumbnail.png b/_uploads/carbonhack24 thumbnail.png new file mode 100644 index 0000000000..53bf30220a Binary files /dev/null and b/_uploads/carbonhack24 thumbnail.png differ diff --git a/_uploads/how to make chatgpt do anything you want.png b/_uploads/how to make chatgpt do anything you want.png new file mode 100644 index 0000000000..da1feeb19b Binary files /dev/null and b/_uploads/how to make chatgpt do anything you want.png differ diff --git a/_uploads/is agile always the answer_.png b/_uploads/is agile always the answer_.png new file mode 100644 index 0000000000..e58015d347 Binary files /dev/null and b/_uploads/is agile always the answer_.png differ diff --git a/_uploads/tcs.PNG b/_uploads/tcs.PNG new file mode 100644 index 0000000000..990a3d7eb4 Binary files /dev/null and b/_uploads/tcs.PNG differ diff --git a/_uploads/tech-impacts.PNG b/_uploads/tech-impacts.PNG new file mode 100644 index 0000000000..29cbe20a10 Binary files /dev/null and b/_uploads/tech-impacts.PNG differ diff --git a/bpritchard/assets/codebuild/arch-eventbridge.png b/bpritchard/assets/codebuild/arch-eventbridge.png new file mode 100644 index 0000000000..e7182a98e5 Binary files /dev/null and b/bpritchard/assets/codebuild/arch-eventbridge.png differ diff --git a/bpritchard/assets/codebuild/arch-simplified.png b/bpritchard/assets/codebuild/arch-simplified.png new file mode 100644 index 0000000000..3f9382543c Binary files /dev/null and b/bpritchard/assets/codebuild/arch-simplified.png differ diff --git a/bpritchard/assets/codebuild/sf-nightly-shutdown.png 
b/bpritchard/assets/codebuild/sf-nightly-shutdown.png new file mode 100644 index 0000000000..c7955d0abe Binary files /dev/null and b/bpritchard/assets/codebuild/sf-nightly-shutdown.png differ diff --git a/bpritchard/assets/codebuild/sf-poc.png b/bpritchard/assets/codebuild/sf-poc.png new file mode 100644 index 0000000000..007e6632e5 Binary files /dev/null and b/bpritchard/assets/codebuild/sf-poc.png differ diff --git a/bpritchard/assets/codebuild/sf-simple.png b/bpritchard/assets/codebuild/sf-simple.png new file mode 100644 index 0000000000..c711936ca2 Binary files /dev/null and b/bpritchard/assets/codebuild/sf-simple.png differ diff --git a/colive/assets/askingChatGPT.png b/colive/assets/askingChatGPT.png new file mode 100644 index 0000000000..71f7089647 Binary files /dev/null and b/colive/assets/askingChatGPT.png differ diff --git a/colive/assets/askingPunditBot.png b/colive/assets/askingPunditBot.png new file mode 100644 index 0000000000..11083a948b Binary files /dev/null and b/colive/assets/askingPunditBot.png differ diff --git a/colive/assets/basic.png b/colive/assets/basic.png new file mode 100644 index 0000000000..5ff2e2b684 Binary files /dev/null and b/colive/assets/basic.png differ diff --git a/colive/assets/complete.png b/colive/assets/complete.png new file mode 100644 index 0000000000..15e2f20c3e Binary files /dev/null and b/colive/assets/complete.png differ diff --git a/csalt/assets/aws-ai-and-bi/aws-quicksight-1.png b/csalt/assets/aws-ai-and-bi/aws-quicksight-1.png new file mode 100644 index 0000000000..51af22b7b7 Binary files /dev/null and b/csalt/assets/aws-ai-and-bi/aws-quicksight-1.png differ diff --git a/csalt/assets/aws-ai-and-bi/aws-quicksight-2.png b/csalt/assets/aws-ai-and-bi/aws-quicksight-2.png new file mode 100644 index 0000000000..e9c1937370 Binary files /dev/null and b/csalt/assets/aws-ai-and-bi/aws-quicksight-2.png differ diff --git a/csalt/assets/aws-ai-and-bi/aws-sagemaker-canvas-1.png 
b/csalt/assets/aws-ai-and-bi/aws-sagemaker-canvas-1.png new file mode 100644 index 0000000000..1700ca4c5c Binary files /dev/null and b/csalt/assets/aws-ai-and-bi/aws-sagemaker-canvas-1.png differ diff --git a/csalt/assets/aws-ai-and-bi/aws-sagemaker-canvas-2.png b/csalt/assets/aws-ai-and-bi/aws-sagemaker-canvas-2.png new file mode 100644 index 0000000000..730ce43470 Binary files /dev/null and b/csalt/assets/aws-ai-and-bi/aws-sagemaker-canvas-2.png differ diff --git a/csalt/picture.jpg b/csalt/picture.jpg index 63266d7584..99a7ce2091 100644 Binary files a/csalt/picture.jpg and b/csalt/picture.jpg differ diff --git a/dhope/assets/messagetypes/API_hit.svg b/dhope/assets/messagetypes/API_hit.svg new file mode 100644 index 0000000000..9497b1a586 --- /dev/null +++ b/dhope/assets/messagetypes/API_hit.svg @@ -0,0 +1,21 @@ + + + + + + + + Data busConsumerEventConsumerConsumerConsumerConsumerProducerAsyncREST \ No newline at end of file diff --git a/dhope/assets/messagetypes/EnrichmentPath.svg b/dhope/assets/messagetypes/EnrichmentPath.svg new file mode 100644 index 0000000000..7571beec82 --- /dev/null +++ b/dhope/assets/messagetypes/EnrichmentPath.svg @@ -0,0 +1,21 @@ + + + + + + + + SourceEnricher 1Enricher 2More dataMore dataMessage with all data \ No newline at end of file diff --git a/dhope/assets/messagetypes/different_events.svg b/dhope/assets/messagetypes/different_events.svg new file mode 100644 index 0000000000..698f54d86a --- /dev/null +++ b/dhope/assets/messagetypes/different_events.svg @@ -0,0 +1,21 @@ + + + + + + + + Order servicePlace orderPayments ServicePrices ServiceData warehouseGetpriceSync, REST callStock price T=1Stock price T=1Stock price T=1Order placed, ID=3Take paymentTrade executionExecutetradeAsync MessageTrade detailsPaymenttakenEvent messageCommand messageTime series dataState message \ No newline at end of file diff --git a/dhope/assets/messagetypes/envelope.svg b/dhope/assets/messagetypes/envelope.svg new file mode 100644 index 
0000000000..d96705ed67 --- /dev/null +++ b/dhope/assets/messagetypes/envelope.svg @@ -0,0 +1,21 @@ + + + + + + + + gdfID = 1time=01/01/11{name=David}{name=Matt}ID = 2time=01/01/11EnvelopeMessage \ No newline at end of file diff --git a/dhope/assets/messagetypes/stashing.svg b/dhope/assets/messagetypes/stashing.svg new file mode 100644 index 0000000000..6fd96a684f --- /dev/null +++ b/dhope/assets/messagetypes/stashing.svg @@ -0,0 +1,21 @@ + + + + + + + + User tableEmailAddress PostcodePayment tableEmailUser IDReferencesPaymentUserEmail confirmation serviceEvents arrivingHas tableHas tableTemp tableBLOBPaymentHas tableStash whilst waiting on usermessage \ No newline at end of file diff --git a/dhope/assets/messagetypes/this_blog_last_blog.svg b/dhope/assets/messagetypes/this_blog_last_blog.svg new file mode 100644 index 0000000000..bb2ffc2b7b --- /dev/null +++ b/dhope/assets/messagetypes/this_blog_last_blog.svg @@ -0,0 +1,21 @@ + + + + + + + + Producer serviceConsumer AEventRabbit MQ BrokerConsumer BEventEventEventPrevious blogThis blog \ No newline at end of file diff --git a/dhope/assets/messagetypes/videoenriched.svg b/dhope/assets/messagetypes/videoenriched.svg new file mode 100644 index 0000000000..7f037b5448 --- /dev/null +++ b/dhope/assets/messagetypes/videoenriched.svg @@ -0,0 +1,21 @@ + + + + + + + + Final service in enrichment chainCore + Technical + EditorialEventSearch serviceRails servicePlaybackTechnical metadataEditorial DataCore Data \ No newline at end of file diff --git a/dhope/assets/messagetypes/zoomed_event.svg b/dhope/assets/messagetypes/zoomed_event.svg new file mode 100644 index 0000000000..7e4de4e48b --- /dev/null +++ b/dhope/assets/messagetypes/zoomed_event.svg @@ -0,0 +1,21 @@ + + + + + + + + Producer serviceConsumer AEventRabbit MQ BrokerConsumer BEventEventEvent \ No newline at end of file diff --git a/jisted/atom.xml b/jisted/atom.xml new file mode 100644 index 0000000000..066dced8e9 --- /dev/null +++ b/jisted/atom.xml @@ -0,0 +1,5 @@ 
+--- +author: jisted +layout: atom_feed +--- + diff --git a/jisted/feed.xml b/jisted/feed.xml new file mode 100644 index 0000000000..862bd9dce0 --- /dev/null +++ b/jisted/feed.xml @@ -0,0 +1,5 @@ +--- +author: jisted +layout: rss_feed +--- + diff --git a/jisted/index.html b/jisted/index.html new file mode 100644 index 0000000000..9c3f849312 --- /dev/null +++ b/jisted/index.html @@ -0,0 +1,6 @@ +--- +title: Josh Isted +author: jisted +layout: default_author +--- + diff --git a/jisted/jisted.png b/jisted/jisted.png new file mode 100644 index 0000000000..a77a7a0a5a Binary files /dev/null and b/jisted/jisted.png differ diff --git a/lperrett/lisa-profile.png b/lperrett/lisa-profile.png index fbd36e7bfd..501cdb502f 100644 Binary files a/lperrett/lisa-profile.png and b/lperrett/lisa-profile.png differ diff --git a/mgriffin/assets/common-crawl-private.svg b/mgriffin/assets/common-crawl-private.svg new file mode 100644 index 0000000000..e817589caa --- /dev/null +++ b/mgriffin/assets/common-crawl-private.svg @@ -0,0 +1,4 @@ + + + +
Inputs
Inputs
storage/replication-factor: 3
storage/replication-factor: 3
cloud-storage-metadata
cloud-storage-metada...
timestamp: '2021-04-01 00:00:00'
duration: 86832000
cloud/vendor: aws
cloud/service: s3
storage/drive-size: 10000
storage/drive-power: 6.5
storage/data-stored: 115000
grid/carbon-intensity: 396
device/emissions-embodied: 200000
device/expected-lifespan: 157680000
timestamp: '2021-04-01 00:00:00'...
storage/energy: 5408.91
storage/energy: 5408.91
storage-energy
storage-energy
operational-carbon: 2141928.36
operational-carbon: 2141928.36
carbon-intensity
carbon-intensity
resources-total: 10000
resources-total: 10000
drive-size-to-resources
drive-size-to-resour...
resources-reserved: 345000
resources-reserved: 345000
data-to-resources
data-to-resources
carbon-embodied: 3799726
carbon-embodied: 3799726
embodied-carbon
embodied-carbon
Outputs
Outputs
Text is not SVG - cannot display
\ No newline at end of file diff --git a/sconway/assets/bloom-filter-pushdown-10x-perf.png b/sconway/assets/bloom-filter-pushdown-10x-perf.png new file mode 100644 index 0000000000..ac1fe39004 Binary files /dev/null and b/sconway/assets/bloom-filter-pushdown-10x-perf.png differ diff --git a/sconway/assets/spark-connect-communication.png b/sconway/assets/spark-connect-communication.png new file mode 100644 index 0000000000..9804ee5d0e Binary files /dev/null and b/sconway/assets/spark-connect-communication.png differ diff --git a/sconway/assets/spark.2.3.vs.3.2.1.concurrent.png b/sconway/assets/spark.2.3.vs.3.2.1.concurrent.png new file mode 100644 index 0000000000..cdd2b5b844 Binary files /dev/null and b/sconway/assets/spark.2.3.vs.3.2.1.concurrent.png differ diff --git a/sconway/assets/spark.2.3.vs.3.2.1.sequential.png b/sconway/assets/spark.2.3.vs.3.2.1.sequential.png new file mode 100644 index 0000000000..cc8c429509 Binary files /dev/null and b/sconway/assets/spark.2.3.vs.3.2.1.sequential.png differ diff --git a/sconway/assets/torchdistributor.png b/sconway/assets/torchdistributor.png new file mode 100644 index 0000000000..f402f82fea Binary files /dev/null and b/sconway/assets/torchdistributor.png differ