diff --git a/NAMESPACE b/NAMESPACE index 200d85b..33a9ea7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +S3method(llm_classify,"tbl_Spark SQL") S3method(llm_classify,data.frame) S3method(llm_custom,data.frame) S3method(llm_extract,data.frame) @@ -34,6 +35,7 @@ import(rlang) importFrom(dplyr,bind_cols) importFrom(dplyr,mutate) importFrom(dplyr,pull) +importFrom(dplyr,sql) importFrom(dplyr,tibble) importFrom(jsonlite,fromJSON) importFrom(jsonlite,read_json) diff --git a/R/llm-classify.R b/R/llm-classify.R index 3679a44..f6a4ae1 100644 --- a/R/llm-classify.R +++ b/R/llm-classify.R @@ -77,6 +77,21 @@ llm_classify.data.frame <- function(.data, ) } +#' @export +`llm_classify.tbl_Spark SQL` <- function(.data, + col, + labels, + pred_name = ".classify", + additional_prompt = "") { + prep_labels <- paste0("'", labels, "'", collapse = ", ") + mutate( + .data = .data, + !!pred_name := ai_classify({{ col }}, array(sql(prep_labels))) + ) +} + +globalVariables(c("ai_classify", "array")) + #' @rdname llm_classify #' @export llm_vec_classify <- function(x, diff --git a/R/mall.R b/R/mall.R index 36e63e8..e57f757 100644 --- a/R/mall.R +++ b/R/mall.R @@ -1,5 +1,5 @@ #' @importFrom ollamar chat test_connection list_models -#' @importFrom dplyr mutate tibble bind_cols pull +#' @importFrom dplyr mutate tibble bind_cols pull sql #' @importFrom utils menu head #' @importFrom jsonlite fromJSON read_json write_json #' @import fs diff --git a/_freeze/articles/caching/execute-results/html.json b/_freeze/articles/caching/execute-results/html.json new file mode 100644 index 0000000..3f165ab --- /dev/null +++ b/_freeze/articles/caching/execute-results/html.json @@ -0,0 +1,15 @@ +{ + "hash": "9d356c51f2bf3faca6acf1dd4add9abf", + "result": { + "engine": "knitr", + "markdown": "---\ntitle: \"Caching results\"\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n\nData preparation, and model preparation, is usually a iterative process. Because\nmodels in R are normally rather fast, it is not a problem to re-run the\nentire code to confirm that all of the results are reproducible. But in\nthe case of LLM's, re-running things may be a problem. Locally, running the \nLLM will be processor intensive, and typically long. If running against a remote\nLLM, the issue would the cost per token. \n\nTo ameliorate this, `mall` is able to cache existing results in a folder. That way, \nrunning the same analysis over and over, will be much quicker. Because instead of\ncalling the LLM again, `mall` will return the previously recorded result. \n\nBy default, this functionality is turned on. The results will be saved to a folder\nnamed \"_mall_cache\" . The name of the folder can be easily changed, simply set\nthe `.cache` argument in `llm_use()`. To **disable** this functionality, set\nthe argument to an empty character, meaning `.cache = \"\"`.\n\n## How it works\n\n`mall` uses all of the values used to make the LLM query as the \"finger print\"\nto confidently identify when the same query is being done again. This includes:\n\n- The value in the particular row\n- The additional prompting built by the `llm_` function,\n- Any other arguments/options used, set in `llm_use()`\n- The name of the back end used for the call\n\nA file is created that contains the request and response. The key to the process\nis the name of the file itself. The name is the hashed value of the combined\nvalue of the items listed above. 
This becomes the \"finger print\" that allows \n`mall` to know if there is an existing cache. \n\n## Walk-through \n\nWe will initialize the LLM session specifying a seed\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(mall)\n\nllm_use(\"ollama\", \"llama3.1\", seed = 100)\n#> \n#> ── mall session object\n#> Backend: ollama\n#> LLM session:\n#> model:llama3.1\n#> \n#> seed:100\n#> \n#> R session: cache_folder:_mall_cache\n```\n:::\n\n\n\n\nUsing the `tictoc` package, we will measure how long it takes to make a simple\nsentiment call. \n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(tictoc)\n\ntic()\nllm_vec_sentiment(\"I am happy\")\n#> [1] \"positive\"\ntoc()\n#> 1.266 sec elapsed\n```\n:::\n\n\n\n\nThis creates a the \"_mall_cache\" folder, and inside a sub-folder, it creates a \nfile with the cache. The name of the file is the resulting hash value of the\ncombination mentioned in the previous section. \n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndir_ls(\"_mall_cache\", recurse = TRUE, type = \"file\")\n#> _mall_cache/08/086214f2638f60496fd0468d7de37c59.json\n```\n:::\n\n\n\n\nThe cache is a JSON file, that contains both the request, and the response. As\nmentioned in the previous section, the named of the file is derived from the\ncombining the values in the request (`$request`).\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\njsonlite::read_json(\n \"_mall_cache/08/086214f2638f60496fd0468d7de37c59.json\", \n simplifyVector = TRUE, \n flatten = TRUE\n )\n#> $request\n#> $request$messages\n#> role\n#> 1 user\n#> content\n#> 1 You are a helpful sentiment engine. Return only one of the following answers: positive, negative, neutral. No capitalization. No explanations. The answer is based on the following text:\\nI am happy\n#> \n#> $request$output\n#> [1] \"text\"\n#> \n#> $request$model\n#> [1] \"llama3.1\"\n#> \n#> $request$seed\n#> [1] 100\n#> \n#> \n#> $response\n#> [1] \"positive\"\n```\n:::\n\n\n\n\nRe-running the same `mall` call, will complete significantly faster\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\ntic()\nllm_vec_sentiment(\"I am happy\")\n#> [1] \"positive\"\ntoc()\n#> 0.001 sec elapsed\n```\n:::\n\n\n\n\nIf a slightly different query is made, `mall` will recognize that this is a\ndifferent call, and it will send it to the LLM. The results are then saved in a \nnew JSON file. 
\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_vec_sentiment(\"I am very happy\")\n#> [1] \"positive\"\n\ndir_ls(\"_mall_cache\", recurse = TRUE, type = \"file\")\n#> _mall_cache/08/086214f2638f60496fd0468d7de37c59.json\n#> _mall_cache/7c/7c7cfcfddc43a90b4deb9d7e60e88291.json\n```\n:::\n\n\n\n\nDuring the same R session, if we change something in `llm_use()` that will\nimpact the request to the LLM, that will trigger a new cache file\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_use(seed = 101)\n#> \n#> ── mall session object\n#> Backend: ollama\n#> LLM session:\n#> model:llama3.1\n#> \n#> seed:101\n#> \n#> R session: cache_folder:_mall_cache\n\nllm_vec_sentiment(\"I am very happy\")\n#> [1] \"positive\"\n\ndir_ls(\"_mall_cache\", recurse = TRUE, type = \"file\")\n#> _mall_cache/08/086214f2638f60496fd0468d7de37c59.json\n#> _mall_cache/7c/7c7cfcfddc43a90b4deb9d7e60e88291.json\n#> _mall_cache/f1/f1c72c2bf22e22074cef9c859d6344a6.json\n```\n:::\n\n\n\n\nThe only argument that does not trigger a new cache file is `.silent`\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_use(seed = 101, .silent = TRUE)\n\nllm_vec_sentiment(\"I am very happy\")\n#> [1] \"positive\"\n\ndir_ls(\"_mall_cache\", recurse = TRUE, type = \"file\")\n#> _mall_cache/08/086214f2638f60496fd0468d7de37c59.json\n#> _mall_cache/7c/7c7cfcfddc43a90b4deb9d7e60e88291.json\n#> _mall_cache/f1/f1c72c2bf22e22074cef9c859d6344a6.json\n```\n:::\n\n\n\n\n## Performance improvements \n\nTo drive home the point of the usefulness of this feature, we will use the\nsame data set we used for the README. To start, we will change the cache folder\nto make it easy to track the new files\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_use(.cache = \"_performance_cache\", .silent = TRUE)\n```\n:::\n\n\n\nAs mentioned, we will use the `data_bookReviews` data frame from the `classmap`\npackage\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(classmap)\n\ndata(data_bookReviews)\n```\n:::\n\n\n\n\nThe individual reviews in this data set are really long. So they take a while to\nprocess. To run this test, we will use the first 5 rows: \n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\ntic()\n\ndata_bookReviews |>\n head(5) |> \n llm_sentiment(review)\n#> # A tibble: 5 × 3\n#> review sentiment .sentiment\n#> \n#> 1 \"i got this as both a book and an audio file… 1 negative \n#> 2 \"this book places too much emphasis on spend… 1 negative \n#> 3 \"remember the hollywood blacklist? 
the holly… 2 negative \n#> 4 \"while i appreciate what tipler was attempti… 1 negative \n#> 5 \"the others in the series were great, and i … 1 negative\n\ntoc()\n#> 10.223 sec elapsed\n```\n:::\n\n\n\n\nThe analysis took about 10 seconds on my laptop, so around 2 seconds per record.\nThat may not seem like much, but during model, or workflow, development having\nto wait this long every time will take its toll on our time, and patience.\n\nThe new cache folder now has the 5 records cached in their corresponding \nJSON files\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndir_ls(\"_performance_cache\", recurse = TRUE, type = \"file\")\n#> _performance_cache/23/23ea4fff55a6058db3b4feefe447ddeb.json\n#> _performance_cache/60/60a0dbb7d3b8133d40e2f74deccdbf47.json\n#> _performance_cache/76/76f1b84b70328b1b3533436403914217.json\n#> _performance_cache/c7/c7cf6e0f9683ae29eba72b0a4dd4b189.json\n#> _performance_cache/e3/e375559b424833d17c7bcb067fe6b0f8.json\n```\n:::\n\n\n\n\nRe-running the same exact call will not take a fraction of a fraction of the\noriginal time!\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\ntic()\n\ndata_bookReviews |>\n head(5) |> \n llm_sentiment(review)\n#> # A tibble: 5 × 3\n#> review sentiment .sentiment\n#> \n#> 1 \"i got this as both a book and an audio file… 1 negative \n#> 2 \"this book places too much emphasis on spend… 1 negative \n#> 3 \"remember the hollywood blacklist? the holly… 2 negative \n#> 4 \"while i appreciate what tipler was attempti… 1 negative \n#> 5 \"the others in the series were great, and i … 1 negative\n\ntoc()\n#> 0.01 sec elapsed\n```\n:::\n\n\n\n\nRunning an additional record, will only cost the time it takes to process it.\nThe other 5 will still be scored using their cached result\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\ntic()\n\ndata_bookReviews |>\n head(6) |> \n llm_sentiment(review)\n#> # A tibble: 6 × 3\n#> review sentiment .sentiment\n#> \n#> 1 \"i got this as both a book and an audio file… 1 negative \n#> 2 \"this book places too much emphasis on spend… 1 negative \n#> 3 \"remember the hollywood blacklist? the holly… 2 negative \n#> 4 \"while i appreciate what tipler was attempti… 1 negative \n#> 5 \"the others in the series were great, and i … 1 negative \n#> 6 \"a few good things, but she's lost her edge … 1 negative\n\ntoc()\n#> 0.624 sec elapsed\n```\n:::\n\n\n\n\n## Set the seed!\n\nIf at the end of your analysis, you plan to re-run all of the code, and you\nwant to take advantage of the caching functionaly, then set the model seed. This\nwill allow for the exact same results to be returned by the LLM.\n\nIf no seed is set during development, then the results will always come back \nthe same because the cache is being read. But once the cache is removed, to run \neverything from 0, then you will get different results. This is because the \ninvariability of the cache results, mask the fact that the model will have \nvariability. 
\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_use(\"ollama\", \"llama3.1\", seed = 999)\n#> \n#> ── mall session object\n#> Backend: ollama\n#> LLM session:\n#> model:llama3.1\n#> \n#> seed:999\n#> \n#> R session: cache_folder:_performance_cache\n```\n:::\n", + "supporting": [], + "filters": [ + "rmarkdown/pagebreak.lua" + ], + "includes": {}, + "engineDependencies": {}, + "preserve": {}, + "postProcess": true + } +} \ No newline at end of file diff --git a/_freeze/articles/databricks/execute-results/html.json b/_freeze/articles/databricks/execute-results/html.json index 603b6e8..d7f0faf 100644 --- a/_freeze/articles/databricks/execute-results/html.json +++ b/_freeze/articles/databricks/execute-results/html.json @@ -1,8 +1,8 @@ { - "hash": "50703f906f5d84aa29e81d59b39892f0", + "hash": "14b9a746cd4616068294da1820cdc83b", "result": { "engine": "knitr", - "markdown": "---\ntitle: \"Databricks\"\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n\n\n\nThis brief example shows how seamless it is to use the same functions,\nbut against a remote database connection. Today, it works with the following\nfunctions:\n\n- `llm_sentiment()` / `llm_vec_sentiment()`\n- `llm_summarize()` / `llm_vec_summarize()`\n\n## Examples\n\nWe will start by connecting to the Databricks Warehouse\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(mall)\nlibrary(DBI)\n\ncon <- dbConnect(\n odbc::databricks(),\n HTTPPath = Sys.getenv(\"DATABRICKS_PATH\")\n)\n```\n:::\n\n\n\n\nNext, we will create a small reviews table\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(dplyr)\n\nreviews <- tribble(\n ~review,\n \"This has been the best TV I've ever used. Great screen, and sound.\",\n \"I regret buying this laptop. It is too slow and the keyboard is too noisy\",\n \"Not sure how to feel about my new washing machine. Great color, but hard to figure\"\n)\n\ntbl_reviews <- copy_to(con, reviews)\n```\n:::\n\n\n\n\nUsing `llm_sentiment()` in Databricks will call that vendor's SQL AI function\ndirectly:\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\ntbl_reviews |>\n llm_sentiment(review)\n#> # Source: SQL [3 x 2]\n#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]\n#> review .sentiment\n#> \n#> 1 This has been the best TV Ive ever used. Great screen, and sound. positive \n#> 2 I regret buying this laptop. It is too slow and the keyboard is to… negative \n#> 3 Not sure how to feel about my new washing machine. Great color, bu… mixed\n```\n:::\n\n\n\n\nThere are some differences in the arguments, and output of the LLM's. Notice\nthat instead of \"neutral\", the prediction is \"mixed\". The AI Sentiment function\ndoes not allow to change the possible options.\n\nNext, we will try `llm_summarize()`. The `max_words` argument maps to the same\nargument in the AI Summarize function:\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\ntbl_reviews |>\n llm_summarize(review, max_words = 5)\n#> # Source: SQL [3 x 2]\n#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]\n#> review .summary\n#> \n#> 1 This has been the best TV Ive ever used. Great screen, and sound. Superio…\n#> 2 I regret buying this laptop. It is too slow and the keyboard is too … Slow, n…\n#> 3 Not sure how to feel about my new washing machine. Great color, but … Initial…\n```\n:::\n", + "markdown": "---\ntitle: \"Databricks\"\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n\n\n\nThis brief example shows how seamless it is to use the same functions,\nbut against a remote database connection. 
Today, it works with the following\nfunctions:\n\n- `llm_sentiment()`\n- `llm_summarize()`\n- `llm_classify()`\n\n## Examples\n\nWe will start by connecting to the Databricks Warehouse\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(mall)\nlibrary(DBI)\n\ncon <- dbConnect(\n odbc::databricks(),\n HTTPPath = Sys.getenv(\"DATABRICKS_PATH\")\n)\n```\n:::\n\n\n\n\nNext, we will create a small reviews table\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(dplyr)\n\nreviews <- tribble(\n ~review,\n \"This has been the best TV I've ever used. Great screen, and sound.\",\n \"I regret buying this laptop. It is too slow and the keyboard is too noisy\",\n \"Not sure how to feel about my new washing machine. Great color, but hard to figure\"\n)\n\ntbl_reviews <- copy_to(con, reviews, overwrite = TRUE)\n```\n:::\n\n\n\n\nUsing `llm_sentiment()` in Databricks will call that vendor's SQL AI function\ndirectly:\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\ntbl_reviews |>\n llm_sentiment(review)\n#> # Source: SQL [3 x 2]\n#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]\n#> review .sentiment\n#> \n#> 1 This has been the best TV Ive ever used. Great screen, and sound. positive \n#> 2 I regret buying this laptop. It is too slow and the keyboard is to… negative \n#> 3 Not sure how to feel about my new washing machine. Great color, bu… mixed\n```\n:::\n\n\n\n\nThere are some differences in the arguments, and output of the LLM's. Notice\nthat instead of \"neutral\", the prediction is \"mixed\". The AI Sentiment function\ndoes not allow to change the possible options.\n\nNext, we will try `llm_summarize()`. The `max_words` argument maps to the same\nargument in the AI Summarize function:\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\ntbl_reviews |>\n llm_summarize(review, max_words = 5) |> \n show_query()\n#> \n#> SELECT `reviews`.*, ai_summarize(`review`, CAST(5.0 AS INT)) AS `.summary`\n#> FROM `reviews`\n```\n:::\n\n\n\n\n`llm_classify()` for this back-end, will only accept unnamed options. \n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\ntbl_reviews |> \n llm_classify(review, c(\"appliance\", \"computer\"))\n#> # Source: SQL [3 x 2]\n#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]\n#> review .classify\n#> \n#> 1 This has been the best TV Ive ever used. Great screen, and sound. appliance\n#> 2 I regret buying this laptop. It is too slow and the keyboard is too… computer \n#> 3 Not sure how to feel about my new washing machine. Great color, but… appliance\n```\n:::\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_quarto.yml b/_quarto.yml index 3a7230a..7ed0039 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -25,8 +25,10 @@ website: background: light collapse-level: 2 contents: + - text: "Caching" + href: articles/caching.qmd - text: "Databricks" - href: articles/databricks.qmd + href: articles/databricks.qmd format: html: diff --git a/articles/caching.qmd b/articles/caching.qmd new file mode 100644 index 0000000..5b6653c --- /dev/null +++ b/articles/caching.qmd @@ -0,0 +1,211 @@ +--- +title: "Caching results" +execute: + eval: true + freeze: true +--- + +```{r} +#| include: false +library(fs) +library(jsonlite) +dir_delete("_mall_cache") +dir_delete("_performance_cache") +source("../utils/knitr-print.R") +``` + +Data preparation, and model preparation, is usually a iterative process. Because +models in R are normally rather fast, it is not a problem to re-run the +entire code to confirm that all of the results are reproducible. 
+But in the case of LLMs, re-running things may be a problem. Locally, running the
+LLM will be processor intensive, and typically slow. If running against a remote
+LLM, the issue would be the cost per token.
+
+To ameliorate this, `mall` is able to cache existing results in a folder. That way,
+running the same analysis over and over will be much quicker, because instead of
+calling the LLM again, `mall` will return the previously recorded result.
+
+By default, this functionality is turned on. The results will be saved to a folder
+named "_mall_cache". The name of the folder can be easily changed; simply set
+the `.cache` argument in `llm_use()`. To **disable** this functionality, set
+the argument to an empty character, meaning `.cache = ""`.
+
+## How it works
+
+`mall` uses all of the values used to make the LLM query as the "finger print"
+that identifies when the same query is being made again. This includes:
+
+- The value in the particular row
+- The additional prompting built by the `llm_` function
+- Any other arguments/options set in `llm_use()`
+- The name of the back end used for the call
+
+A file is created that contains the request and the response. The key to the
+process is the name of the file itself: it is the hash of the combined values of
+the items listed above. This becomes the "finger print" that allows `mall` to
+know if there is an existing cache.
+
+## Walk-through
+
+We will initialize the LLM session, specifying a seed:
+
+```{r}
+library(mall)
+
+llm_use("ollama", "llama3.1", seed = 100)
+```
+
+Using the `tictoc` package, we will measure how long it takes to make a simple
+sentiment call.
+
+```{r}
+library(tictoc)
+
+tic()
+llm_vec_sentiment("I am happy")
+toc()
+```
+
+This creates the "_mall_cache" folder and, inside a sub-folder, a file with the
+cache. The name of the file is the resulting hash value of the combination
+mentioned in the previous section.
+
+```{r}
+dir_ls("_mall_cache", recurse = TRUE, type = "file")
+```
+
+The cache is a JSON file that contains both the request and the response. As
+mentioned in the previous section, the name of the file is derived from
+combining the values in the request (`$request`).
+
+```{r}
+jsonlite::read_json(
+  "_mall_cache/08/086214f2638f60496fd0468d7de37c59.json",
+  simplifyVector = TRUE,
+  flatten = TRUE
+)
+```
+
+Re-running the same `mall` call will complete significantly faster:
+
+```{r}
+tic()
+llm_vec_sentiment("I am happy")
+toc()
+```
+
+If a slightly different query is made, `mall` will recognize that this is a
+different call and will send it to the LLM. The results are then saved in a
+new JSON file.
+
+```{r}
+llm_vec_sentiment("I am very happy")
+
+dir_ls("_mall_cache", recurse = TRUE, type = "file")
+```
+
+During the same R session, if we change something in `llm_use()` that impacts
+the request to the LLM, that will trigger a new cache file:
+
+```{r}
+llm_use(seed = 101)
+
+llm_vec_sentiment("I am very happy")
+
+dir_ls("_mall_cache", recurse = TRUE, type = "file")
+```
+
+The only argument that does not trigger a new cache file is `.silent`:
+
+```{r}
+llm_use(seed = 101, .silent = TRUE)
+
+llm_vec_sentiment("I am very happy")
+
+dir_ls("_mall_cache", recurse = TRUE, type = "file")
+```
+
+## Performance improvements
+
+To drive home how useful this feature is, we will use the same data set we
+used for the README.
+To start, we will change the cache folder to make it easy to track the new
+files:
+
+```{r}
+llm_use(.cache = "_performance_cache", .silent = TRUE)
+```
+As mentioned, we will use the `data_bookReviews` data frame from the `classmap`
+package:
+
+```{r}
+library(classmap)
+
+data(data_bookReviews)
+```
+
+The individual reviews in this data set are really long, so they take a while
+to process. To run this test, we will use the first 5 rows:
+
+```{r}
+tic()
+
+data_bookReviews |>
+  head(5) |>
+  llm_sentiment(review)
+
+toc()
+```
+
+The analysis took about 10 seconds on my laptop, so around 2 seconds per record.
+That may not seem like much, but during model or workflow development, having
+to wait this long every time will take its toll on our time and patience.
+
+The new cache folder now has the 5 records cached in their corresponding
+JSON files:
+
+```{r}
+dir_ls("_performance_cache", recurse = TRUE, type = "file")
+```
+
+Re-running the exact same call will take only a fraction of a fraction of the
+original time!
+
+```{r}
+tic()
+
+data_bookReviews |>
+  head(5) |>
+  llm_sentiment(review)
+
+toc()
+```
+
+Running an additional record will only cost the time it takes to process that
+record. The other 5 will still be scored using their cached results:
+
+```{r}
+tic()
+
+data_bookReviews |>
+  head(6) |>
+  llm_sentiment(review)
+
+toc()
+```
+
+## Set the seed!
+
+If, at the end of your analysis, you plan to re-run all of the code and you
+want to take advantage of the caching functionality, then set the model seed.
+This will allow the LLM to return the exact same results.
+
+If no seed is set during development, the results will always come back the
+same because the cache is being read. But once the cache is removed, in order
+to run everything from scratch, you will get different results. This is because
+the invariability of the cached results masks the fact that the model's output
+will vary.
+
+```{r}
+llm_use("ollama", "llama3.1", seed = 999)
+```
+
diff --git a/articles/databricks.qmd b/articles/databricks.qmd
index b42192b..d56c810 100644
--- a/articles/databricks.qmd
+++ b/articles/databricks.qmd
@@ -18,8 +18,9 @@ This brief example shows how seamless it is to use the same functions,
 but against a remote database connection. Today, it works with the following
 functions:
 
-- `llm_sentiment()` / `llm_vec_sentiment()`
-- `llm_summarize()` / `llm_vec_summarize()`
+- `llm_sentiment()`
+- `llm_summarize()`
+- `llm_classify()`
 
 ## Examples
 
@@ -47,7 +48,7 @@ reviews <- tribble(
   "Not sure how to feel about my new washing machine. Great color, but hard to figure"
 )
 
-tbl_reviews <- copy_to(con, reviews)
+tbl_reviews <- copy_to(con, reviews, overwrite = TRUE)
 ```
 
 Using `llm_sentiment()` in Databricks will call that vendor's SQL AI function
@@ -67,5 +68,15 @@ argument in the AI Summarize function:
 
 ```{r}
 tbl_reviews |>
-  llm_summarize(review, max_words = 5)
+  llm_summarize(review, max_words = 5) |>
+  show_query()
 ```
+
+`llm_classify()` for this back-end will only accept unnamed options.
+ +```{r} +tbl_reviews |> + llm_classify(review, c("appliance", "computer")) +``` + + diff --git a/tests/testthat/_snaps/llm-classify.md b/tests/testthat/_snaps/llm-classify.md index 36a1524..6584816 100644 --- a/tests/testthat/_snaps/llm-classify.md +++ b/tests/testthat/_snaps/llm-classify.md @@ -1,3 +1,20 @@ +# Classify translates expected Spark SQL + + Code + llm_classify(df_spark, x, c("a", "b")) + Output + + SELECT `df`.*, ai_classify(`x`, array('a', 'b')) AS `.classify` + FROM `df` + +# Preview works + + Code + llm_vec_classify("this is a test", c("a", "b"), preview = TRUE) + Output + ollamar::chat(messages = list(list(role = "user", content = "You are a helpful classification engine. Determine if the text refers to one of the following: a, b. No capitalization. No explanations. The answer is based on the following text:\nthis is a test")), + output = "text", model = "llama3.1", seed = 100) + # Classify on Ollama works Code @@ -42,11 +59,3 @@ 2 appliance 3 appliance -# Preview works - - Code - llm_vec_classify("this is a test", c("a", "b"), preview = TRUE) - Output - ollamar::chat(messages = list(list(role = "user", content = "You are a helpful classification engine. Determine if the text refers to one of the following: a, b. No capitalization. No explanations. The answer is based on the following text:\nthis is a test")), - output = "text", model = "llama3.1", seed = 100) - diff --git a/tests/testthat/test-llm-classify.R b/tests/testthat/test-llm-classify.R index 009bd4d..ab48270 100644 --- a/tests/testthat/test-llm-classify.R +++ b/tests/testthat/test-llm-classify.R @@ -26,6 +26,20 @@ test_that("Classify works", { ) }) +test_that("Classify translates expected Spark SQL", { + suppressPackageStartupMessages(library(dbplyr)) + df <- data.frame(x = 1) + df_spark <- tbl_lazy(df, con = simulate_spark_sql()) + expect_snapshot(llm_classify(df_spark, x, c("a", "b"))) +}) + +test_that("Preview works", { + llm_use("ollama", "llama3.1", seed = 100, .silent = FALSE) + expect_snapshot( + llm_vec_classify("this is a test", c("a", "b"), preview = TRUE) + ) +}) + test_that("Classify on Ollama works", { skip_if_no_ollama() reviews <- reviews_table() @@ -54,10 +68,3 @@ test_that("Classify on Ollama works", { ) ) }) - -test_that("Preview works", { - llm_use("ollama", "llama3.1", seed = 100, .silent = FALSE) - expect_snapshot( - llm_vec_classify("this is a test", c("a", "b"), preview = TRUE) - ) -})
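For reference, the translation performed by the new `tbl_Spark SQL` method can be previewed without a live Databricks warehouse by simulating the Spark SQL dialect with `dbplyr`, just as the new snapshot test does. The sketch below is a usage example only; the sample column and labels are placeholders. Inside the method, `prep_labels` collapses the labels into the quoted list `'appliance', 'computer'`, which `sql()` injects verbatim into `array()`.

```r
library(dplyr)
library(dbplyr)
library(mall)

# Simulate the Spark SQL dialect so the generated query can be inspected
# without connecting to a Databricks warehouse (same approach as the test).
df_spark <- tbl_lazy(
  data.frame(x = "this laptop is too slow"),
  con = simulate_spark_sql()
)

df_spark |>
  llm_classify(x, c("appliance", "computer")) |>
  show_query()
# Expected translation, mirroring the snapshot test above:
# SELECT `df`.*, ai_classify(`x`, array('appliance', 'computer')) AS `.classify`
# FROM `df`
```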
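The caching article describes the cache file name as a hash of the combined request values, filed under a sub-folder named after the first two characters of that hash (e.g. `_mall_cache/08/086214f2638f60496fd0468d7de37c59.json`). The sketch below only illustrates that idea; it is not `mall`'s actual implementation, and the hashing helper (`rlang::hash()`) and folder layout are assumptions inferred from the file listing in the walk-through.

```r
library(rlang)

# Hypothetical helper showing the "finger print" idea: hash the full request
# (row value + built-up prompt + model options), then file it under a
# sub-folder named after the first two characters of the hash.
cache_path_for <- function(request, folder = "_mall_cache") {
  fingerprint <- hash(request) # 32-character hash of all request values
  file.path(folder, substr(fingerprint, 1, 2), paste0(fingerprint, ".json"))
}

request <- list(
  messages = list(list(role = "user", content = "...prompt + row value...")),
  output = "text",
  model = "llama3.1",
  seed = 100
)

cache_path_for(request)
# returns something like "_mall_cache/xx/xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.json"
```

Because the request itself forms the fingerprint, any change made through `llm_use()` (except `.silent`) yields a new file, which matches the behavior shown in the walk-through.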