From 6e20e64727005f4e5e16fb04cfcf1492ddc85e53 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Mon, 30 Sep 2024 19:45:04 -0500 Subject: [PATCH 01/57] Starts folder restructure --- _pkgdown.yml | 13 ------------- DESCRIPTION => r/DESCRIPTION | 0 LICENSE => r/LICENSE | 0 LICENSE.md => r/LICENSE.md | 0 NAMESPACE => r/NAMESPACE | 0 {R => r/R}/data-reviews.R | 0 {R => r/R}/import-standalone-purrr.R | 0 {R => r/R}/llm-classify.R | 0 {R => r/R}/llm-custom.R | 0 {R => r/R}/llm-extract.R | 0 {R => r/R}/llm-sentiment.R | 0 {R => r/R}/llm-summarize.R | 0 {R => r/R}/llm-translate.R | 0 {R => r/R}/llm-use.R | 0 {R => r/R}/m-backend-prompt.R | 0 {R => r/R}/m-backend-submit.R | 0 {R => r/R}/m-cache.R | 0 {R => r/R}/m-defaults.R | 0 {R => r/R}/m-vec-prompt.R | 0 {R => r/R}/mall.R | 0 {R => r/R}/utils.R | 0 README.Rmd => r/README.Rmd | 0 README.md => r/README.md | 0 {articles => r/articles}/caching.qmd | 0 {articles => r/articles}/databricks.qmd | 0 codecov.yml => r/codecov.yml | 0 {data => r/data}/reviews.rda | Bin index.qmd => r/index.qmd | 0 mall.Rproj => r/mall.Rproj | 0 .../figures/favicon/apple-touch-icon-120x120.png | Bin .../figures/favicon/apple-touch-icon-152x152.png | Bin .../figures/favicon/apple-touch-icon-180x180.png | Bin .../figures/favicon/apple-touch-icon-60x60.png | Bin .../figures/favicon/apple-touch-icon-76x76.png | Bin .../man}/figures/favicon/apple-touch-icon.png | Bin {man => r/man}/figures/favicon/favicon-16x16.png | Bin {man => r/man}/figures/favicon/favicon-32x32.png | Bin {man => r/man}/figures/favicon/favicon.ico | Bin {man => r/man}/figures/logo.png | Bin {man => r/man}/figures/mall.png | Bin {man => r/man}/llm_classify.Rd | 0 {man => r/man}/llm_custom.Rd | 0 {man => r/man}/llm_extract.Rd | 0 {man => r/man}/llm_sentiment.Rd | 0 {man => r/man}/llm_summarize.Rd | 0 {man => r/man}/llm_translate.Rd | 0 {man => r/man}/llm_use.Rd | 0 {man => r/man}/m_backend_submit.Rd | 0 {man => r/man}/reviews.Rd | 0 {reference => r/reference}/index.qmd | 0 {reference => 
r/reference}/llm_classify.qmd | 0 {reference => r/reference}/llm_custom.qmd | 0 {reference => r/reference}/llm_extract.qmd | 0 {reference => r/reference}/llm_sentiment.qmd | 0 {reference => r/reference}/llm_summarize.qmd | 0 {reference => r/reference}/llm_translate.qmd | 0 {reference => r/reference}/llm_use.qmd | 0 {reference => r/reference}/m_backend_submit.qmd | 0 {reference => r/reference}/reviews.qmd | 0 {tests => r/tests}/testthat.R | 0 {tests => r/tests}/testthat/_snaps/llm-classify.md | 0 {tests => r/tests}/testthat/_snaps/llm-custom.md | 0 {tests => r/tests}/testthat/_snaps/llm-extract.md | 0 .../tests}/testthat/_snaps/llm-sentiment.md | 0 .../tests}/testthat/_snaps/llm-summarize.md | 0 .../tests}/testthat/_snaps/llm-translate.md | 0 {tests => r/tests}/testthat/_snaps/llm-use.md | 0 {tests => r/tests}/testthat/_snaps/zzz-cache.md | 0 {tests => r/tests}/testthat/helper-ollama.R | 0 {tests => r/tests}/testthat/test-llm-classify.R | 0 {tests => r/tests}/testthat/test-llm-custom.R | 0 {tests => r/tests}/testthat/test-llm-extract.R | 0 {tests => r/tests}/testthat/test-llm-sentiment.R | 0 {tests => r/tests}/testthat/test-llm-summarize.R | 0 {tests => r/tests}/testthat/test-llm-translate.R | 0 {tests => r/tests}/testthat/test-llm-use.R | 0 .../tests}/testthat/test-m-backend-prompt.R | 0 .../tests}/testthat/test-m-backend-submit.R | 0 {tests => r/tests}/testthat/test-zzz-cache.R | 0 {utils => r/utils}/knitr-print.R | 0 {utils => r/utils}/website/README.md | 0 {utils => r/utils}/website/_reference.qmd | 0 {utils => r/utils}/website/build_reference.R | 0 {utils => r/utils}/website/index-page.R | 0 {utils => r/utils}/website/list-to-qmd.R | 0 {utils => r/utils}/website/rd-to-list.R | 0 86 files changed, 13 deletions(-) delete mode 100644 _pkgdown.yml rename DESCRIPTION => r/DESCRIPTION (100%) rename LICENSE => r/LICENSE (100%) rename LICENSE.md => r/LICENSE.md (100%) rename NAMESPACE => r/NAMESPACE (100%) rename {R => r/R}/data-reviews.R (100%) rename {R => 
r/R}/import-standalone-purrr.R (100%) rename {R => r/R}/llm-classify.R (100%) rename {R => r/R}/llm-custom.R (100%) rename {R => r/R}/llm-extract.R (100%) rename {R => r/R}/llm-sentiment.R (100%) rename {R => r/R}/llm-summarize.R (100%) rename {R => r/R}/llm-translate.R (100%) rename {R => r/R}/llm-use.R (100%) rename {R => r/R}/m-backend-prompt.R (100%) rename {R => r/R}/m-backend-submit.R (100%) rename {R => r/R}/m-cache.R (100%) rename {R => r/R}/m-defaults.R (100%) rename {R => r/R}/m-vec-prompt.R (100%) rename {R => r/R}/mall.R (100%) rename {R => r/R}/utils.R (100%) rename README.Rmd => r/README.Rmd (100%) rename README.md => r/README.md (100%) rename {articles => r/articles}/caching.qmd (100%) rename {articles => r/articles}/databricks.qmd (100%) rename codecov.yml => r/codecov.yml (100%) rename {data => r/data}/reviews.rda (100%) rename index.qmd => r/index.qmd (100%) rename mall.Rproj => r/mall.Rproj (100%) rename {man => r/man}/figures/favicon/apple-touch-icon-120x120.png (100%) rename {man => r/man}/figures/favicon/apple-touch-icon-152x152.png (100%) rename {man => r/man}/figures/favicon/apple-touch-icon-180x180.png (100%) rename {man => r/man}/figures/favicon/apple-touch-icon-60x60.png (100%) rename {man => r/man}/figures/favicon/apple-touch-icon-76x76.png (100%) rename {man => r/man}/figures/favicon/apple-touch-icon.png (100%) rename {man => r/man}/figures/favicon/favicon-16x16.png (100%) rename {man => r/man}/figures/favicon/favicon-32x32.png (100%) rename {man => r/man}/figures/favicon/favicon.ico (100%) rename {man => r/man}/figures/logo.png (100%) rename {man => r/man}/figures/mall.png (100%) rename {man => r/man}/llm_classify.Rd (100%) rename {man => r/man}/llm_custom.Rd (100%) rename {man => r/man}/llm_extract.Rd (100%) rename {man => r/man}/llm_sentiment.Rd (100%) rename {man => r/man}/llm_summarize.Rd (100%) rename {man => r/man}/llm_translate.Rd (100%) rename {man => r/man}/llm_use.Rd (100%) rename {man => r/man}/m_backend_submit.Rd (100%) 
rename {man => r/man}/reviews.Rd (100%) rename {reference => r/reference}/index.qmd (100%) rename {reference => r/reference}/llm_classify.qmd (100%) rename {reference => r/reference}/llm_custom.qmd (100%) rename {reference => r/reference}/llm_extract.qmd (100%) rename {reference => r/reference}/llm_sentiment.qmd (100%) rename {reference => r/reference}/llm_summarize.qmd (100%) rename {reference => r/reference}/llm_translate.qmd (100%) rename {reference => r/reference}/llm_use.qmd (100%) rename {reference => r/reference}/m_backend_submit.qmd (100%) rename {reference => r/reference}/reviews.qmd (100%) rename {tests => r/tests}/testthat.R (100%) rename {tests => r/tests}/testthat/_snaps/llm-classify.md (100%) rename {tests => r/tests}/testthat/_snaps/llm-custom.md (100%) rename {tests => r/tests}/testthat/_snaps/llm-extract.md (100%) rename {tests => r/tests}/testthat/_snaps/llm-sentiment.md (100%) rename {tests => r/tests}/testthat/_snaps/llm-summarize.md (100%) rename {tests => r/tests}/testthat/_snaps/llm-translate.md (100%) rename {tests => r/tests}/testthat/_snaps/llm-use.md (100%) rename {tests => r/tests}/testthat/_snaps/zzz-cache.md (100%) rename {tests => r/tests}/testthat/helper-ollama.R (100%) rename {tests => r/tests}/testthat/test-llm-classify.R (100%) rename {tests => r/tests}/testthat/test-llm-custom.R (100%) rename {tests => r/tests}/testthat/test-llm-extract.R (100%) rename {tests => r/tests}/testthat/test-llm-sentiment.R (100%) rename {tests => r/tests}/testthat/test-llm-summarize.R (100%) rename {tests => r/tests}/testthat/test-llm-translate.R (100%) rename {tests => r/tests}/testthat/test-llm-use.R (100%) rename {tests => r/tests}/testthat/test-m-backend-prompt.R (100%) rename {tests => r/tests}/testthat/test-m-backend-submit.R (100%) rename {tests => r/tests}/testthat/test-zzz-cache.R (100%) rename {utils => r/utils}/knitr-print.R (100%) rename {utils => r/utils}/website/README.md (100%) rename {utils => r/utils}/website/_reference.qmd (100%) 
rename {utils => r/utils}/website/build_reference.R (100%) rename {utils => r/utils}/website/index-page.R (100%) rename {utils => r/utils}/website/list-to-qmd.R (100%) rename {utils => r/utils}/website/rd-to-list.R (100%) diff --git a/_pkgdown.yml b/_pkgdown.yml deleted file mode 100644 index 3af800a..0000000 --- a/_pkgdown.yml +++ /dev/null @@ -1,13 +0,0 @@ -url: https://edgararuiz.github.io/mall/ -template: - bootstrap: 5 - light-switch: true -navbar: - structure: - right: [search, github, lightswitch] -repo: - url: - home: https://github.com/edgararuiz/mall - source: https://github.com/edgararuiz/mall/blob/HEAD/ - issue: https://github.com/edgararuiz/mall/issues/ - user: https://github.com/ diff --git a/DESCRIPTION b/r/DESCRIPTION similarity index 100% rename from DESCRIPTION rename to r/DESCRIPTION diff --git a/LICENSE b/r/LICENSE similarity index 100% rename from LICENSE rename to r/LICENSE diff --git a/LICENSE.md b/r/LICENSE.md similarity index 100% rename from LICENSE.md rename to r/LICENSE.md diff --git a/NAMESPACE b/r/NAMESPACE similarity index 100% rename from NAMESPACE rename to r/NAMESPACE diff --git a/R/data-reviews.R b/r/R/data-reviews.R similarity index 100% rename from R/data-reviews.R rename to r/R/data-reviews.R diff --git a/R/import-standalone-purrr.R b/r/R/import-standalone-purrr.R similarity index 100% rename from R/import-standalone-purrr.R rename to r/R/import-standalone-purrr.R diff --git a/R/llm-classify.R b/r/R/llm-classify.R similarity index 100% rename from R/llm-classify.R rename to r/R/llm-classify.R diff --git a/R/llm-custom.R b/r/R/llm-custom.R similarity index 100% rename from R/llm-custom.R rename to r/R/llm-custom.R diff --git a/R/llm-extract.R b/r/R/llm-extract.R similarity index 100% rename from R/llm-extract.R rename to r/R/llm-extract.R diff --git a/R/llm-sentiment.R b/r/R/llm-sentiment.R similarity index 100% rename from R/llm-sentiment.R rename to r/R/llm-sentiment.R diff --git a/R/llm-summarize.R b/r/R/llm-summarize.R 
similarity index 100% rename from R/llm-summarize.R rename to r/R/llm-summarize.R diff --git a/R/llm-translate.R b/r/R/llm-translate.R similarity index 100% rename from R/llm-translate.R rename to r/R/llm-translate.R diff --git a/R/llm-use.R b/r/R/llm-use.R similarity index 100% rename from R/llm-use.R rename to r/R/llm-use.R diff --git a/R/m-backend-prompt.R b/r/R/m-backend-prompt.R similarity index 100% rename from R/m-backend-prompt.R rename to r/R/m-backend-prompt.R diff --git a/R/m-backend-submit.R b/r/R/m-backend-submit.R similarity index 100% rename from R/m-backend-submit.R rename to r/R/m-backend-submit.R diff --git a/R/m-cache.R b/r/R/m-cache.R similarity index 100% rename from R/m-cache.R rename to r/R/m-cache.R diff --git a/R/m-defaults.R b/r/R/m-defaults.R similarity index 100% rename from R/m-defaults.R rename to r/R/m-defaults.R diff --git a/R/m-vec-prompt.R b/r/R/m-vec-prompt.R similarity index 100% rename from R/m-vec-prompt.R rename to r/R/m-vec-prompt.R diff --git a/R/mall.R b/r/R/mall.R similarity index 100% rename from R/mall.R rename to r/R/mall.R diff --git a/R/utils.R b/r/R/utils.R similarity index 100% rename from R/utils.R rename to r/R/utils.R diff --git a/README.Rmd b/r/README.Rmd similarity index 100% rename from README.Rmd rename to r/README.Rmd diff --git a/README.md b/r/README.md similarity index 100% rename from README.md rename to r/README.md diff --git a/articles/caching.qmd b/r/articles/caching.qmd similarity index 100% rename from articles/caching.qmd rename to r/articles/caching.qmd diff --git a/articles/databricks.qmd b/r/articles/databricks.qmd similarity index 100% rename from articles/databricks.qmd rename to r/articles/databricks.qmd diff --git a/codecov.yml b/r/codecov.yml similarity index 100% rename from codecov.yml rename to r/codecov.yml diff --git a/data/reviews.rda b/r/data/reviews.rda similarity index 100% rename from data/reviews.rda rename to r/data/reviews.rda diff --git a/index.qmd b/r/index.qmd similarity 
index 100% rename from index.qmd rename to r/index.qmd diff --git a/mall.Rproj b/r/mall.Rproj similarity index 100% rename from mall.Rproj rename to r/mall.Rproj diff --git a/man/figures/favicon/apple-touch-icon-120x120.png b/r/man/figures/favicon/apple-touch-icon-120x120.png similarity index 100% rename from man/figures/favicon/apple-touch-icon-120x120.png rename to r/man/figures/favicon/apple-touch-icon-120x120.png diff --git a/man/figures/favicon/apple-touch-icon-152x152.png b/r/man/figures/favicon/apple-touch-icon-152x152.png similarity index 100% rename from man/figures/favicon/apple-touch-icon-152x152.png rename to r/man/figures/favicon/apple-touch-icon-152x152.png diff --git a/man/figures/favicon/apple-touch-icon-180x180.png b/r/man/figures/favicon/apple-touch-icon-180x180.png similarity index 100% rename from man/figures/favicon/apple-touch-icon-180x180.png rename to r/man/figures/favicon/apple-touch-icon-180x180.png diff --git a/man/figures/favicon/apple-touch-icon-60x60.png b/r/man/figures/favicon/apple-touch-icon-60x60.png similarity index 100% rename from man/figures/favicon/apple-touch-icon-60x60.png rename to r/man/figures/favicon/apple-touch-icon-60x60.png diff --git a/man/figures/favicon/apple-touch-icon-76x76.png b/r/man/figures/favicon/apple-touch-icon-76x76.png similarity index 100% rename from man/figures/favicon/apple-touch-icon-76x76.png rename to r/man/figures/favicon/apple-touch-icon-76x76.png diff --git a/man/figures/favicon/apple-touch-icon.png b/r/man/figures/favicon/apple-touch-icon.png similarity index 100% rename from man/figures/favicon/apple-touch-icon.png rename to r/man/figures/favicon/apple-touch-icon.png diff --git a/man/figures/favicon/favicon-16x16.png b/r/man/figures/favicon/favicon-16x16.png similarity index 100% rename from man/figures/favicon/favicon-16x16.png rename to r/man/figures/favicon/favicon-16x16.png diff --git a/man/figures/favicon/favicon-32x32.png b/r/man/figures/favicon/favicon-32x32.png similarity index 100% 
rename from man/figures/favicon/favicon-32x32.png rename to r/man/figures/favicon/favicon-32x32.png diff --git a/man/figures/favicon/favicon.ico b/r/man/figures/favicon/favicon.ico similarity index 100% rename from man/figures/favicon/favicon.ico rename to r/man/figures/favicon/favicon.ico diff --git a/man/figures/logo.png b/r/man/figures/logo.png similarity index 100% rename from man/figures/logo.png rename to r/man/figures/logo.png diff --git a/man/figures/mall.png b/r/man/figures/mall.png similarity index 100% rename from man/figures/mall.png rename to r/man/figures/mall.png diff --git a/man/llm_classify.Rd b/r/man/llm_classify.Rd similarity index 100% rename from man/llm_classify.Rd rename to r/man/llm_classify.Rd diff --git a/man/llm_custom.Rd b/r/man/llm_custom.Rd similarity index 100% rename from man/llm_custom.Rd rename to r/man/llm_custom.Rd diff --git a/man/llm_extract.Rd b/r/man/llm_extract.Rd similarity index 100% rename from man/llm_extract.Rd rename to r/man/llm_extract.Rd diff --git a/man/llm_sentiment.Rd b/r/man/llm_sentiment.Rd similarity index 100% rename from man/llm_sentiment.Rd rename to r/man/llm_sentiment.Rd diff --git a/man/llm_summarize.Rd b/r/man/llm_summarize.Rd similarity index 100% rename from man/llm_summarize.Rd rename to r/man/llm_summarize.Rd diff --git a/man/llm_translate.Rd b/r/man/llm_translate.Rd similarity index 100% rename from man/llm_translate.Rd rename to r/man/llm_translate.Rd diff --git a/man/llm_use.Rd b/r/man/llm_use.Rd similarity index 100% rename from man/llm_use.Rd rename to r/man/llm_use.Rd diff --git a/man/m_backend_submit.Rd b/r/man/m_backend_submit.Rd similarity index 100% rename from man/m_backend_submit.Rd rename to r/man/m_backend_submit.Rd diff --git a/man/reviews.Rd b/r/man/reviews.Rd similarity index 100% rename from man/reviews.Rd rename to r/man/reviews.Rd diff --git a/reference/index.qmd b/r/reference/index.qmd similarity index 100% rename from reference/index.qmd rename to r/reference/index.qmd diff 
--git a/reference/llm_classify.qmd b/r/reference/llm_classify.qmd similarity index 100% rename from reference/llm_classify.qmd rename to r/reference/llm_classify.qmd diff --git a/reference/llm_custom.qmd b/r/reference/llm_custom.qmd similarity index 100% rename from reference/llm_custom.qmd rename to r/reference/llm_custom.qmd diff --git a/reference/llm_extract.qmd b/r/reference/llm_extract.qmd similarity index 100% rename from reference/llm_extract.qmd rename to r/reference/llm_extract.qmd diff --git a/reference/llm_sentiment.qmd b/r/reference/llm_sentiment.qmd similarity index 100% rename from reference/llm_sentiment.qmd rename to r/reference/llm_sentiment.qmd diff --git a/reference/llm_summarize.qmd b/r/reference/llm_summarize.qmd similarity index 100% rename from reference/llm_summarize.qmd rename to r/reference/llm_summarize.qmd diff --git a/reference/llm_translate.qmd b/r/reference/llm_translate.qmd similarity index 100% rename from reference/llm_translate.qmd rename to r/reference/llm_translate.qmd diff --git a/reference/llm_use.qmd b/r/reference/llm_use.qmd similarity index 100% rename from reference/llm_use.qmd rename to r/reference/llm_use.qmd diff --git a/reference/m_backend_submit.qmd b/r/reference/m_backend_submit.qmd similarity index 100% rename from reference/m_backend_submit.qmd rename to r/reference/m_backend_submit.qmd diff --git a/reference/reviews.qmd b/r/reference/reviews.qmd similarity index 100% rename from reference/reviews.qmd rename to r/reference/reviews.qmd diff --git a/tests/testthat.R b/r/tests/testthat.R similarity index 100% rename from tests/testthat.R rename to r/tests/testthat.R diff --git a/tests/testthat/_snaps/llm-classify.md b/r/tests/testthat/_snaps/llm-classify.md similarity index 100% rename from tests/testthat/_snaps/llm-classify.md rename to r/tests/testthat/_snaps/llm-classify.md diff --git a/tests/testthat/_snaps/llm-custom.md b/r/tests/testthat/_snaps/llm-custom.md similarity index 100% rename from 
tests/testthat/_snaps/llm-custom.md rename to r/tests/testthat/_snaps/llm-custom.md diff --git a/tests/testthat/_snaps/llm-extract.md b/r/tests/testthat/_snaps/llm-extract.md similarity index 100% rename from tests/testthat/_snaps/llm-extract.md rename to r/tests/testthat/_snaps/llm-extract.md diff --git a/tests/testthat/_snaps/llm-sentiment.md b/r/tests/testthat/_snaps/llm-sentiment.md similarity index 100% rename from tests/testthat/_snaps/llm-sentiment.md rename to r/tests/testthat/_snaps/llm-sentiment.md diff --git a/tests/testthat/_snaps/llm-summarize.md b/r/tests/testthat/_snaps/llm-summarize.md similarity index 100% rename from tests/testthat/_snaps/llm-summarize.md rename to r/tests/testthat/_snaps/llm-summarize.md diff --git a/tests/testthat/_snaps/llm-translate.md b/r/tests/testthat/_snaps/llm-translate.md similarity index 100% rename from tests/testthat/_snaps/llm-translate.md rename to r/tests/testthat/_snaps/llm-translate.md diff --git a/tests/testthat/_snaps/llm-use.md b/r/tests/testthat/_snaps/llm-use.md similarity index 100% rename from tests/testthat/_snaps/llm-use.md rename to r/tests/testthat/_snaps/llm-use.md diff --git a/tests/testthat/_snaps/zzz-cache.md b/r/tests/testthat/_snaps/zzz-cache.md similarity index 100% rename from tests/testthat/_snaps/zzz-cache.md rename to r/tests/testthat/_snaps/zzz-cache.md diff --git a/tests/testthat/helper-ollama.R b/r/tests/testthat/helper-ollama.R similarity index 100% rename from tests/testthat/helper-ollama.R rename to r/tests/testthat/helper-ollama.R diff --git a/tests/testthat/test-llm-classify.R b/r/tests/testthat/test-llm-classify.R similarity index 100% rename from tests/testthat/test-llm-classify.R rename to r/tests/testthat/test-llm-classify.R diff --git a/tests/testthat/test-llm-custom.R b/r/tests/testthat/test-llm-custom.R similarity index 100% rename from tests/testthat/test-llm-custom.R rename to r/tests/testthat/test-llm-custom.R diff --git a/tests/testthat/test-llm-extract.R 
b/r/tests/testthat/test-llm-extract.R similarity index 100% rename from tests/testthat/test-llm-extract.R rename to r/tests/testthat/test-llm-extract.R diff --git a/tests/testthat/test-llm-sentiment.R b/r/tests/testthat/test-llm-sentiment.R similarity index 100% rename from tests/testthat/test-llm-sentiment.R rename to r/tests/testthat/test-llm-sentiment.R diff --git a/tests/testthat/test-llm-summarize.R b/r/tests/testthat/test-llm-summarize.R similarity index 100% rename from tests/testthat/test-llm-summarize.R rename to r/tests/testthat/test-llm-summarize.R diff --git a/tests/testthat/test-llm-translate.R b/r/tests/testthat/test-llm-translate.R similarity index 100% rename from tests/testthat/test-llm-translate.R rename to r/tests/testthat/test-llm-translate.R diff --git a/tests/testthat/test-llm-use.R b/r/tests/testthat/test-llm-use.R similarity index 100% rename from tests/testthat/test-llm-use.R rename to r/tests/testthat/test-llm-use.R diff --git a/tests/testthat/test-m-backend-prompt.R b/r/tests/testthat/test-m-backend-prompt.R similarity index 100% rename from tests/testthat/test-m-backend-prompt.R rename to r/tests/testthat/test-m-backend-prompt.R diff --git a/tests/testthat/test-m-backend-submit.R b/r/tests/testthat/test-m-backend-submit.R similarity index 100% rename from tests/testthat/test-m-backend-submit.R rename to r/tests/testthat/test-m-backend-submit.R diff --git a/tests/testthat/test-zzz-cache.R b/r/tests/testthat/test-zzz-cache.R similarity index 100% rename from tests/testthat/test-zzz-cache.R rename to r/tests/testthat/test-zzz-cache.R diff --git a/utils/knitr-print.R b/r/utils/knitr-print.R similarity index 100% rename from utils/knitr-print.R rename to r/utils/knitr-print.R diff --git a/utils/website/README.md b/r/utils/website/README.md similarity index 100% rename from utils/website/README.md rename to r/utils/website/README.md diff --git a/utils/website/_reference.qmd b/r/utils/website/_reference.qmd similarity index 100% rename from 
utils/website/_reference.qmd rename to r/utils/website/_reference.qmd diff --git a/utils/website/build_reference.R b/r/utils/website/build_reference.R similarity index 100% rename from utils/website/build_reference.R rename to r/utils/website/build_reference.R diff --git a/utils/website/index-page.R b/r/utils/website/index-page.R similarity index 100% rename from utils/website/index-page.R rename to r/utils/website/index-page.R diff --git a/utils/website/list-to-qmd.R b/r/utils/website/list-to-qmd.R similarity index 100% rename from utils/website/list-to-qmd.R rename to r/utils/website/list-to-qmd.R diff --git a/utils/website/rd-to-list.R b/r/utils/website/rd-to-list.R similarity index 100% rename from utils/website/rd-to-list.R rename to r/utils/website/rd-to-list.R From d42f8b18215e5718955990e13e1f6db6cf62838b Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Tue, 1 Oct 2024 15:55:07 -0500 Subject: [PATCH 02/57] Initial working implementation --- .gitignore | 1 + python/MANIFEST.in | 1 + python/README.md | 0 python/mall/__init__.py | 3 + python/mall/llm.py | 32 +++++++++++ python/pyproject.toml | 14 +++++ python/uv.lock | 120 ++++++++++++++++++++++++++++++++++++++++ 7 files changed, 171 insertions(+) create mode 100644 python/MANIFEST.in create mode 100644 python/README.md create mode 100644 python/mall/__init__.py create mode 100644 python/mall/llm.py create mode 100644 python/pyproject.toml create mode 100644 python/uv.lock diff --git a/.gitignore b/.gitignore index 5099731..c825b24 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,4 @@ rsconnect/ docs/ +python/mall/src/ diff --git a/python/MANIFEST.in b/python/MANIFEST.in new file mode 100644 index 0000000..c08dce5 --- /dev/null +++ b/python/MANIFEST.in @@ -0,0 +1 @@ +graft mall diff --git a/python/README.md b/python/README.md new file mode 100644 index 0000000..e69de29 diff --git a/python/mall/__init__.py b/python/mall/__init__.py new file mode 100644 index 0000000..80ea08f --- /dev/null +++ 
b/python/mall/__init__.py @@ -0,0 +1,3 @@ +__all__ = ["MallFrame"] + +from mall.llm import MallFrame diff --git a/python/mall/llm.py b/python/mall/llm.py new file mode 100644 index 0000000..fb39509 --- /dev/null +++ b/python/mall/llm.py @@ -0,0 +1,32 @@ +import polars as pl +import ollama + + +@pl.api.register_dataframe_namespace("llm") +class MallFrame: + def __init__(self, df: pl.DataFrame) -> None: + self._df = df + + def sentiment(self, col, pred_name="sentiment") -> list[pl.DataFrame]: + prompt = ( + "You are a helpful sentiment engine. Return only one of the following" + + " answers: positive, negative, neutral. No capitalization. No explanations. " + + "The answer is based on the following text:\n" + ) + df = self._df.with_columns( + pl.col(col) + .map_elements( + lambda x: ollama.chat( + model="llama3.2", + messages=[ + { + "role": "user", + "content": prompt + x, + } + ], + )["message"]["content"], + return_dtype=pl.String, + ) + .alias(pred_name) + ) + return df diff --git a/python/pyproject.toml b/python/pyproject.toml new file mode 100644 index 0000000..277be4c --- /dev/null +++ b/python/pyproject.toml @@ -0,0 +1,14 @@ +[project] +name = "mall" +version = "0.1.0" +description = "Add your description here" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "ollama>=0.3.3", + "polars>=1.9.0", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" diff --git a/python/uv.lock b/python/uv.lock new file mode 100644 index 0000000..e752db4 --- /dev/null +++ b/python/uv.lock @@ -0,0 +1,120 @@ +version = 1 +requires-python = ">=3.12" + +[[package]] +name = "anyio" +version = "4.6.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "idna" }, + { name = "sniffio" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/78/49/f3f17ec11c4a91fe79275c426658e509b07547f874b14c1a526d86a83fc8/anyio-4.6.0.tar.gz", hash = 
"sha256:137b4559cbb034c477165047febb6ff83f390fc3b20bf181c1fc0a728cb8beeb", size = 170983 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9e/ef/7a4f225581a0d7886ea28359179cb861d7fbcdefad29663fc1167b86f69f/anyio-4.6.0-py3-none-any.whl", hash = "sha256:c7d2e9d63e31599eeb636c8c5c03a7e108d73b345f064f1c19fdc87b79036a9a", size = 89631 }, +] + +[[package]] +name = "certifi" +version = "2024.8.30" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b0/ee/9b19140fe824b367c04c5e1b369942dd754c4c5462d5674002f75c4dedc1/certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9", size = 168507 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/90/3c9ff0512038035f59d279fddeb79f5f1eccd8859f06d6163c58798b9487/certifi-2024.8.30-py3-none-any.whl", hash = "sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8", size = 167321 }, +] + +[[package]] +name = "h11" +version = "0.14.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f5/38/3af3d3633a34a3316095b39c8e8fb4853a28a536e55d347bd8d8e9a14b03/h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d", size = 100418 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/95/04/ff642e65ad6b90db43e668d70ffb6736436c7ce41fcc549f4e9472234127/h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761", size = 58259 }, +] + +[[package]] +name = "httpcore" +version = "1.0.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "h11" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b6/44/ed0fa6a17845fb033bd885c03e842f08c1b9406c86a2e60ac1ae1b9206a6/httpcore-1.0.6.tar.gz", hash = "sha256:73f6dbd6eb8c21bbf7ef8efad555481853f5f6acdeaff1edb0694289269ee17f", size = 85180 } 
+wheels = [ + { url = "https://files.pythonhosted.org/packages/06/89/b161908e2f51be56568184aeb4a880fd287178d176fd1c860d2217f41106/httpcore-1.0.6-py3-none-any.whl", hash = "sha256:27b59625743b85577a8c0e10e55b50b5368a4f2cfe8cc7bcfa9cf00829c2682f", size = 78011 }, +] + +[[package]] +name = "httpx" +version = "0.27.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "certifi" }, + { name = "httpcore" }, + { name = "idna" }, + { name = "sniffio" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/78/82/08f8c936781f67d9e6b9eeb8a0c8b4e406136ea4c3d1f89a5db71d42e0e6/httpx-0.27.2.tar.gz", hash = "sha256:f7c2be1d2f3c3c3160d441802406b206c2b76f5947b11115e6df10c6c65e66c2", size = 144189 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/56/95/9377bcb415797e44274b51d46e3249eba641711cf3348050f76ee7b15ffc/httpx-0.27.2-py3-none-any.whl", hash = "sha256:7bb2708e112d8fdd7829cd4243970f0c223274051cb35ee80c03301ee29a3df0", size = 76395 }, +] + +[[package]] +name = "idna" +version = "3.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 }, +] + +[[package]] +name = "mall" +version = "0.1.0" +source = { editable = "." 
} +dependencies = [ + { name = "ollama" }, + { name = "polars" }, +] + +[package.metadata] +requires-dist = [ + { name = "ollama", specifier = ">=0.3.3" }, + { name = "polars" }, +] + +[[package]] +name = "ollama" +version = "0.3.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "httpx" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a6/8e/60a9b065eb796ef3996451cbe2d8044f6b030696166693b9805ae33b8b4c/ollama-0.3.3.tar.gz", hash = "sha256:f90a6d61803117f40b0e8ff17465cab5e1eb24758a473cfe8101aff38bc13b51", size = 10390 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6a/ca/d22905ac3f768523f778189d38c9c6cd9edf4fa9dd09cb5a3fc57b184f90/ollama-0.3.3-py3-none-any.whl", hash = "sha256:ca6242ce78ab34758082b7392df3f9f6c2cb1d070a9dede1a4c545c929e16dba", size = 10267 }, +] + +[[package]] +name = "polars" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/53/09/c2fb0b231d551e0c8e68097d08577712bdff1ba91346cda8228e769602f5/polars-1.9.0.tar.gz", hash = "sha256:8e1206ef876f61c1d50a81e102611ea92ee34631cb135b46ad314bfefd3cb122", size = 4027431 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/64/cc/3d0292048d8f9045a03510aeecda2e6ed9df451ae8853274946ff841f98b/polars-1.9.0-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:a471d2ce96f6fa5dd0ef16bcdb227f3dbe3af8acb776ca52f9e64ef40c7489a0", size = 31870933 }, + { url = "https://files.pythonhosted.org/packages/ee/be/15af97f4d8b775630da16a8bf0141507d9c0ae5f2637b9a27ed337b3b1ba/polars-1.9.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:94b12d731cd200d2c50b13fc070d6353f708e632bca6529c5a72aa6a69e5285d", size = 28171055 }, + { url = "https://files.pythonhosted.org/packages/bb/57/b286b317f061d8f17bab4726a27e7b185fbf3d3db65cf689074256ea34a9/polars-1.9.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f85f132732aa63c6f3b502b0fdfc3ba9f0b78cc6330059b5a2d6f9fd78508acb", 
size = 33063367 }, + { url = "https://files.pythonhosted.org/packages/e5/25/bf5d43dcb538bf6573b15f3d5995a52be61b8fbce0cd737e72c4d25eef88/polars-1.9.0-cp38-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:f753c8941a3b3249d59262d68a856714a96a7d4e16977aefbb196be0c192e151", size = 29764698 }, + { url = "https://files.pythonhosted.org/packages/a6/cf/f9170a3ac20e0efb9d3c1cdacc677e35b711ffd5ec48a6d5f3da7b7d8663/polars-1.9.0-cp38-abi3-win_amd64.whl", hash = "sha256:95de07066cd797dd940fa2783708a7bef93c827a57be0f4dfad3575a6144212b", size = 32819142 }, +] + +[[package]] +name = "sniffio" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235 }, +] From dd5817bef39d351115f31f8d7e1c132a1a1c1f8c Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Tue, 1 Oct 2024 16:21:55 -0500 Subject: [PATCH 03/57] Adds README with working code --- python/README.md | 32 ++++++++++++++++++++++++++++++++ python/README.qmd | 17 +++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 python/README.qmd diff --git a/python/README.md b/python/README.md index e69de29..8f64cdf 100644 --- a/python/README.md +++ b/python/README.md @@ -0,0 +1,32 @@ + + +# mall + +``` python +import mall +import polars as tp + +df = tp.DataFrame( + data=["I am happy", "I am sad"], + schema=[("txt", tp.String)], +) + +df.llm.sentiment("txt") +``` + +
+shape: (2, 2) + +| txt | sentiment | +|--------------|------------| +| str | str | +| "I am happy" | "positive" | +| "I am sad" | "negative" | + +
diff --git a/python/README.qmd b/python/README.qmd new file mode 100644 index 0000000..d901918 --- /dev/null +++ b/python/README.qmd @@ -0,0 +1,17 @@ +--- +format: gfm +--- + +# mall + +```{python} +import mall +import polars as tp + +df = tp.DataFrame( + data=["I am happy", "I am sad"], + schema=[("txt", tp.String)], +) + +df.llm.sentiment("txt") +``` \ No newline at end of file From 06c143be5c8e632a4a48cb8d47968f14f09e9900 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Wed, 2 Oct 2024 14:53:53 -0500 Subject: [PATCH 04/57] starts prompt script --- python/mall/llm.py | 1 + python/mall/prompt.py | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) create mode 100644 python/mall/prompt.py diff --git a/python/mall/llm.py b/python/mall/llm.py index fb39509..35fad13 100644 --- a/python/mall/llm.py +++ b/python/mall/llm.py @@ -22,6 +22,7 @@ def sentiment(self, col, pred_name="sentiment") -> list[pl.DataFrame]: { "role": "user", "content": prompt + x, + } ], )["message"]["content"], diff --git a/python/mall/prompt.py b/python/mall/prompt.py new file mode 100644 index 0000000..7f37d62 --- /dev/null +++ b/python/mall/prompt.py @@ -0,0 +1,20 @@ +def process_labels(x, if_characters = "", if_formula = ""): + return if_characters.format(x) + +def sentiment(options, additional = ""): + new_options = process_labels( + options, + "Return only one of the following answers: {}", + "- If the text is {f_lhs(x)}, return {f_rhs(x)}" + ) + msg = [ + { + "role": "user", + "content": "You are a helpful sentiment engine. " + + f"{new_options}. " + + "No capitalization. No explanations. 
" + + f"{additional} " + + "The answer is based on the following text:\n{}", + } + ] + return msg From ec4a243fe94687b33bea71af1ce58d185d7d3e96 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Wed, 2 Oct 2024 15:40:34 -0500 Subject: [PATCH 05/57] Starts switching sentiment to function --- python/mall/llm.py | 25 +++++++++++-------------- python/mall/prompt.py | 7 ++++--- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/python/mall/llm.py b/python/mall/llm.py index 35fad13..dcac249 100644 --- a/python/mall/llm.py +++ b/python/mall/llm.py @@ -1,5 +1,6 @@ import polars as pl import ollama +from mall.prompt import sentiment @pl.api.register_dataframe_namespace("llm") @@ -7,24 +8,20 @@ class MallFrame: def __init__(self, df: pl.DataFrame) -> None: self._df = df - def sentiment(self, col, pred_name="sentiment") -> list[pl.DataFrame]: - prompt = ( - "You are a helpful sentiment engine. Return only one of the following" - + " answers: positive, negative, neutral. No capitalization. No explanations. 
" - + "The answer is based on the following text:\n" - ) + def sentiment( + self, + col, + additional="", + options="positive, negative", + pred_name="sentiment", + ) -> list[pl.DataFrame]: + msg = sentiment(options, additional=additional) df = self._df.with_columns( - pl.col(col) + pl.col(col) .map_elements( lambda x: ollama.chat( model="llama3.2", - messages=[ - { - "role": "user", - "content": prompt + x, - - } - ], + messages= msg.format(x), )["message"]["content"], return_dtype=pl.String, ) diff --git a/python/mall/prompt.py b/python/mall/prompt.py index 7f37d62..11a79ae 100644 --- a/python/mall/prompt.py +++ b/python/mall/prompt.py @@ -1,11 +1,12 @@ -def process_labels(x, if_characters = "", if_formula = ""): +def process_labels(x, if_characters="", if_list=""): return if_characters.format(x) -def sentiment(options, additional = ""): + +def sentiment(options, additional=""): new_options = process_labels( options, "Return only one of the following answers: {}", - "- If the text is {f_lhs(x)}, return {f_rhs(x)}" + "- If the text is {f_lhs(x)}, return {f_rhs(x)}", ) msg = [ { From 67dd2a4509a44f5f4c87f431607fa4d9fdb9a07e Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Wed, 2 Oct 2024 16:31:03 -0500 Subject: [PATCH 06/57] Adds build_msg and llm_call --- python/mall/llm.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/python/mall/llm.py b/python/mall/llm.py index dcac249..ad723e1 100644 --- a/python/mall/llm.py +++ b/python/mall/llm.py @@ -3,6 +3,19 @@ from mall.prompt import sentiment +def build_msg(x, msg): + out = [] + for msgs in msg: + out.append({"role": msgs["role"], "content": msgs["content"].format(x)}) + return out + + +def llm_call(x, msg): + out = ollama.chat(model="llama3.2", messages=build_msg(x, msg)) + out = out["message"]["content"] + return out + + @pl.api.register_dataframe_namespace("llm") class MallFrame: def __init__(self, df: pl.DataFrame) -> None: @@ -17,12 +30,9 @@ def sentiment( ) -> 
list[pl.DataFrame]: msg = sentiment(options, additional=additional) df = self._df.with_columns( - pl.col(col) + pl.col(col) .map_elements( - lambda x: ollama.chat( - model="llama3.2", - messages= msg.format(x), - )["message"]["content"], + lambda x: llm_call(x, msg), return_dtype=pl.String, ) .alias(pred_name) From 1a99706181ce66d20ebe4059f606e4f54b7cadf7 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Wed, 2 Oct 2024 18:29:54 -0500 Subject: [PATCH 07/57] Starts use(), improves sentiment output --- python/mall/llm.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/python/mall/llm.py b/python/mall/llm.py index ad723e1..66231af 100644 --- a/python/mall/llm.py +++ b/python/mall/llm.py @@ -20,7 +20,10 @@ def llm_call(x, msg): class MallFrame: def __init__(self, df: pl.DataFrame) -> None: self._df = df - + self._use = {"backend": "ollama", "model":"llama3.2"} + def use(self, backend = "", model = "", **kwars): + print(self._use) + return self._df def sentiment( self, col, @@ -29,12 +32,12 @@ def sentiment( pred_name="sentiment", ) -> list[pl.DataFrame]: msg = sentiment(options, additional=additional) - df = self._df.with_columns( + self._df = self._df.with_columns( pl.col(col) .map_elements( lambda x: llm_call(x, msg), return_dtype=pl.String, ) .alias(pred_name) - ) - return df + ) + return self._df From b5c465f8801d4041517d47fd8122fe89a8c0dfe1 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Wed, 2 Oct 2024 19:01:06 -0500 Subject: [PATCH 08/57] Starts using _use elements --- python/mall/llm.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/python/mall/llm.py b/python/mall/llm.py index 66231af..4fb9fca 100644 --- a/python/mall/llm.py +++ b/python/mall/llm.py @@ -10,9 +10,10 @@ def build_msg(x, msg): return out -def llm_call(x, msg): - out = ollama.chat(model="llama3.2", messages=build_msg(x, msg)) - out = out["message"]["content"] +def llm_call(x, msg, backend, model): + if backend == "ollama": + resp 
= ollama.chat(model=model, messages=build_msg(x, msg)) + out = resp["message"]["content"] return out @@ -20,10 +21,12 @@ def llm_call(x, msg): class MallFrame: def __init__(self, df: pl.DataFrame) -> None: self._df = df - self._use = {"backend": "ollama", "model":"llama3.2"} - def use(self, backend = "", model = "", **kwars): + self._use = {"backend": "ollama", "model": "llama3.2"} + + def use(self, backend="", model="", **kwars): print(self._use) return self._df + def sentiment( self, col, @@ -32,12 +35,14 @@ def sentiment( pred_name="sentiment", ) -> list[pl.DataFrame]: msg = sentiment(options, additional=additional) + backend = self._use["backend"] + model = self._use["model"] self._df = self._df.with_columns( pl.col(col) .map_elements( - lambda x: llm_call(x, msg), + lambda x: llm_call(x, msg, backend, model), return_dtype=pl.String, ) .alias(pred_name) - ) + ) return self._df From 8195391ea256856bd1eea267db5748738c7ee66c Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Thu, 3 Oct 2024 09:59:13 -0500 Subject: [PATCH 09/57] Use now records kwargs properly --- python/mall/llm.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/python/mall/llm.py b/python/mall/llm.py index 4fb9fca..25835b5 100644 --- a/python/mall/llm.py +++ b/python/mall/llm.py @@ -21,9 +21,14 @@ def llm_call(x, msg, backend, model): class MallFrame: def __init__(self, df: pl.DataFrame) -> None: self._df = df - self._use = {"backend": "ollama", "model": "llama3.2"} - - def use(self, backend="", model="", **kwars): + self._use = {"backend": "ollama", "model": "llama3.2"} + + def use(self, backend="", model="", **kwargs): + if(backend != ""): + self._use = {"backend": backend, "model": self._use["model"]} + if(model != ""): + self._use = {"backend": self._use["backend"], "model": model} + self._use.update(dict(kwargs)) print(self._use) return self._df From 66a174b7a8ec14e49b1948287e9f2326aa84c54c Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Thu, 3 Oct 2024 11:46:53 
-0500 Subject: [PATCH 10/57] Adds support for options --- python/mall/llm.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/python/mall/llm.py b/python/mall/llm.py index 25835b5..7b6f70c 100644 --- a/python/mall/llm.py +++ b/python/mall/llm.py @@ -10,9 +10,13 @@ def build_msg(x, msg): return out -def llm_call(x, msg, backend, model): - if backend == "ollama": - resp = ollama.chat(model=model, messages=build_msg(x, msg)) +def llm_call(x, msg, use): + if use.get("backend"): + resp = ollama.chat( + model=use.get("model"), + messages=build_msg(x, msg), + options=use.get("options"), + ) out = resp["message"]["content"] return out @@ -21,12 +25,12 @@ def llm_call(x, msg, backend, model): class MallFrame: def __init__(self, df: pl.DataFrame) -> None: self._df = df - self._use = {"backend": "ollama", "model": "llama3.2"} + self._use = {"backend": "ollama", "model": "llama3.2"} def use(self, backend="", model="", **kwargs): - if(backend != ""): + if backend != "": self._use = {"backend": backend, "model": self._use["model"]} - if(model != ""): + if model != "": self._use = {"backend": self._use["backend"], "model": model} self._use.update(dict(kwargs)) print(self._use) @@ -45,7 +49,7 @@ def sentiment( self._df = self._df.with_columns( pl.col(col) .map_elements( - lambda x: llm_call(x, msg, backend, model), + lambda x: llm_call(x, msg, self._use), return_dtype=pl.String, ) .alias(pred_name) From 5aaddf1133134d82984422c85ddfd4873d2f4a94 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Thu, 3 Oct 2024 12:30:30 -0500 Subject: [PATCH 11/57] Uses list for regular options --- python/mall/llm.py | 4 +--- python/mall/prompt.py | 11 +++++++++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/python/mall/llm.py b/python/mall/llm.py index 7b6f70c..e5dc946 100644 --- a/python/mall/llm.py +++ b/python/mall/llm.py @@ -40,12 +40,10 @@ def sentiment( self, col, additional="", - options="positive, negative", + options=["positive", 
"negative"], pred_name="sentiment", ) -> list[pl.DataFrame]: msg = sentiment(options, additional=additional) - backend = self._use["backend"] - model = self._use["model"] self._df = self._df.with_columns( pl.col(col) .map_elements( diff --git a/python/mall/prompt.py b/python/mall/prompt.py index 11a79ae..1797e6d 100644 --- a/python/mall/prompt.py +++ b/python/mall/prompt.py @@ -1,5 +1,10 @@ -def process_labels(x, if_characters="", if_list=""): - return if_characters.format(x) +def process_labels(x, if_list="", if_dict=""): + if type(x) == "list": + out = "" + for i in x: + out += " " + i + out = out.strip() + return out.replace(" ", ", ") def sentiment(options, additional=""): @@ -19,3 +24,5 @@ def sentiment(options, additional=""): } ] return msg + + From d66832b6f22f18def11beda1b6ccdb17791f8306 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Thu, 3 Oct 2024 13:19:23 -0500 Subject: [PATCH 12/57] Improvements to list check --- python/mall/prompt.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/python/mall/prompt.py b/python/mall/prompt.py index 1797e6d..42c2805 100644 --- a/python/mall/prompt.py +++ b/python/mall/prompt.py @@ -1,17 +1,18 @@ def process_labels(x, if_list="", if_dict=""): - if type(x) == "list": + if isinstance(x, list): out = "" for i in x: out += " " + i out = out.strip() - return out.replace(" ", ", ") + out = out.replace(" ", ", ") + out = if_list.replace("{values}", out) def sentiment(options, additional=""): new_options = process_labels( options, - "Return only one of the following answers: {}", - "- If the text is {f_lhs(x)}, return {f_rhs(x)}", + "Return only one of the following answers: {values}", + "- If the text is {key}, return {value}", ) msg = [ { @@ -24,5 +25,3 @@ def sentiment(options, additional=""): } ] return msg - - From 8561c93a8423bc4e3a480cae5a50329da4b61437 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Thu, 3 Oct 2024 14:10:48 -0500 Subject: [PATCH 13/57] Moves polars code to script --- 
python/mall/__init__.py | 2 +- python/mall/llm.py | 36 ------------------------------------ python/mall/polars.py | 36 ++++++++++++++++++++++++++++++++++++ python/mall/prompt.py | 2 +- 4 files changed, 38 insertions(+), 38 deletions(-) create mode 100644 python/mall/polars.py diff --git a/python/mall/__init__.py b/python/mall/__init__.py index 80ea08f..4125a70 100644 --- a/python/mall/__init__.py +++ b/python/mall/__init__.py @@ -1,3 +1,3 @@ __all__ = ["MallFrame"] -from mall.llm import MallFrame +from mall.polars import MallFrame diff --git a/python/mall/llm.py b/python/mall/llm.py index e5dc946..d042bb2 100644 --- a/python/mall/llm.py +++ b/python/mall/llm.py @@ -1,6 +1,4 @@ -import polars as pl import ollama -from mall.prompt import sentiment def build_msg(x, msg): @@ -19,37 +17,3 @@ def llm_call(x, msg, use): ) out = resp["message"]["content"] return out - - -@pl.api.register_dataframe_namespace("llm") -class MallFrame: - def __init__(self, df: pl.DataFrame) -> None: - self._df = df - self._use = {"backend": "ollama", "model": "llama3.2"} - - def use(self, backend="", model="", **kwargs): - if backend != "": - self._use = {"backend": backend, "model": self._use["model"]} - if model != "": - self._use = {"backend": self._use["backend"], "model": model} - self._use.update(dict(kwargs)) - print(self._use) - return self._df - - def sentiment( - self, - col, - additional="", - options=["positive", "negative"], - pred_name="sentiment", - ) -> list[pl.DataFrame]: - msg = sentiment(options, additional=additional) - self._df = self._df.with_columns( - pl.col(col) - .map_elements( - lambda x: llm_call(x, msg, self._use), - return_dtype=pl.String, - ) - .alias(pred_name) - ) - return self._df diff --git a/python/mall/polars.py b/python/mall/polars.py new file mode 100644 index 0000000..54774d5 --- /dev/null +++ b/python/mall/polars.py @@ -0,0 +1,36 @@ +import polars as pl +from mall.prompt import sentiment +from mall.llm import llm_call + 
+@pl.api.register_dataframe_namespace("llm") +class MallFrame: + def __init__(self, df: pl.DataFrame) -> None: + self._df = df + self._use = {"backend": "ollama", "model": "llama3.2"} + + def use(self, backend="", model="", **kwargs): + if backend != "": + self._use = {"backend": backend, "model": self._use["model"]} + if model != "": + self._use = {"backend": self._use["backend"], "model": model} + self._use.update(dict(kwargs)) + print(self._use) + return self._df + + def sentiment( + self, + col, + additional="", + options=["positive", "negative"], + pred_name="sentiment", + ) -> list[pl.DataFrame]: + msg = sentiment(options, additional=additional) + self._df = self._df.with_columns( + pl.col(col) + .map_elements( + lambda x: llm_call(x, msg, self._use), + return_dtype=pl.String, + ) + .alias(pred_name) + ) + return self._df diff --git a/python/mall/prompt.py b/python/mall/prompt.py index 42c2805..7a411e3 100644 --- a/python/mall/prompt.py +++ b/python/mall/prompt.py @@ -6,7 +6,7 @@ def process_labels(x, if_list="", if_dict=""): out = out.strip() out = out.replace(" ", ", ") out = if_list.replace("{values}", out) - + return out def sentiment(options, additional=""): new_options = process_labels( From 3ee4a0fdfe083d72d1aa3e84aa8f0fb9a657807a Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Thu, 3 Oct 2024 14:26:06 -0500 Subject: [PATCH 14/57] Updates README --- python/README.md | 14 +++++--------- python/README.qmd | 8 +++++++- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/python/README.md b/python/README.md index 8f64cdf..c7e7d4c 100644 --- a/python/README.md +++ b/python/README.md @@ -2,6 +2,10 @@ # mall +``` python +pip install "mall @ git+https://git@github.com/edgararuiz/mall.git@python#subdirectory=python" +``` + ``` python import mall import polars as tp @@ -14,14 +18,6 @@ df = tp.DataFrame( df.llm.sentiment("txt") ``` -
-shape: (2, 2) | txt | sentiment | |--------------|------------| @@ -29,4 +25,4 @@ df.llm.sentiment("txt") | "I am happy" | "positive" | | "I am sad" | "negative" | -
+ diff --git a/python/README.qmd b/python/README.qmd index d901918..9060f76 100644 --- a/python/README.qmd +++ b/python/README.qmd @@ -4,6 +4,12 @@ format: gfm # mall + + +```python +pip install "mall @ git+https://git@github.com/edgararuiz/mall.git@python#subdirectory=python" +``` + ```{python} import mall import polars as tp @@ -14,4 +20,4 @@ df = tp.DataFrame( ) df.llm.sentiment("txt") -``` \ No newline at end of file +``` From b1e79a5796f11c8c7715b9857ac31e9668aef651 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Thu, 3 Oct 2024 14:42:51 -0500 Subject: [PATCH 15/57] Updates README --- python/README.md | 41 +++++++++++++++++++++++++++-------------- python/README.qmd | 14 +++++++++----- python/mall/polars.py | 2 +- 3 files changed, 37 insertions(+), 20 deletions(-) diff --git a/python/README.md b/python/README.md index c7e7d4c..d5e0358 100644 --- a/python/README.md +++ b/python/README.md @@ -8,21 +8,34 @@ pip install "mall @ git+https://git@github.com/edgararuiz/mall.git@python#subdir ``` python import mall -import polars as tp - -df = tp.DataFrame( - data=["I am happy", "I am sad"], - schema=[("txt", tp.String)], +import polars as pl + +reviews = pl.DataFrame( + data=[ + "This has been the best TV I've ever used. Great screen, and sound.", + "I regret buying this laptop. It is too slow and the keyboard is too noisy", + "Not sure how to feel about my new washing machine. Great color, but hard to figure" + ], + schema=[("review", pl.String)], ) -df.llm.sentiment("txt") +reviews.llm.sentiment("review") ``` - -| txt | sentiment | -|--------------|------------| -| str | str | -| "I am happy" | "positive" | -| "I am sad" | "negative" | - - +
+shape: (3, 2) + +| review | sentiment | +|----------------------------------|------------| +| str | str | +| "This has been the best TV I've… | "positive" | +| "I regret buying this laptop. I… | "negative" | +| "Not sure how to feel about my … | "neutral" | + +
diff --git a/python/README.qmd b/python/README.qmd index 9060f76..2a409bb 100644 --- a/python/README.qmd +++ b/python/README.qmd @@ -12,12 +12,16 @@ pip install "mall @ git+https://git@github.com/edgararuiz/mall.git@python#subdir ```{python} import mall -import polars as tp +import polars as pl -df = tp.DataFrame( - data=["I am happy", "I am sad"], - schema=[("txt", tp.String)], +reviews = pl.DataFrame( + data=[ + "This has been the best TV I've ever used. Great screen, and sound.", + "I regret buying this laptop. It is too slow and the keyboard is too noisy", + "Not sure how to feel about my new washing machine. Great color, but hard to figure" + ], + schema=[("review", pl.String)], ) -df.llm.sentiment("txt") +reviews.llm.sentiment("review") ``` diff --git a/python/mall/polars.py b/python/mall/polars.py index 54774d5..148294c 100644 --- a/python/mall/polars.py +++ b/python/mall/polars.py @@ -21,7 +21,7 @@ def sentiment( self, col, additional="", - options=["positive", "negative"], + options=["positive", "negative", "neutral"], pred_name="sentiment", ) -> list[pl.DataFrame]: msg = sentiment(options, additional=additional) From 5863bf93f3774f17fa709b3b761f95d4daed6eb5 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Thu, 3 Oct 2024 14:49:03 -0500 Subject: [PATCH 16/57] Test markdown output --- python/README.md | 22 ++++++++++++++-------- python/README.qmd | 2 +- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/python/README.md b/python/README.md index d5e0358..d393386 100644 --- a/python/README.md +++ b/python/README.md @@ -1,4 +1,6 @@ - +--- +toc-title: Table of contents +--- # mall @@ -6,7 +8,8 @@ pip install "mall @ git+https://git@github.com/edgararuiz/mall.git@python#subdirectory=python" ``` -``` python +:::: {.cell execution_count="1"} +``` {.python .cell-code} import mall import polars as pl @@ -22,6 +25,7 @@ reviews = pl.DataFrame( reviews.llm.sentiment("review") ``` +::: {.cell-output .cell-output-display execution_count="3"}
shape: (3, 2) -| review | sentiment | -|----------------------------------|------------| -| str | str | -| "This has been the best TV I've… | "positive" | -| "I regret buying this laptop. I… | "negative" | -| "Not sure how to feel about my … | "neutral" | + review sentiment + -------------------------------------- -------------- + str str + \"This has been the best TV I\'ve... \"positive\" + \"I regret buying this laptop. I... \"negative\" + \"Not sure how to feel about my ... \"neutral\"
+::: +:::: diff --git a/python/README.qmd b/python/README.qmd index 2a409bb..0b15ddb 100644 --- a/python/README.qmd +++ b/python/README.qmd @@ -1,5 +1,5 @@ --- -format: gfm +format: markdown --- # mall From 8e9b1fa9f0267e370c9782ef18fcc6ea5831e2c1 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Thu, 3 Oct 2024 15:06:44 -0500 Subject: [PATCH 17/57] Adds a markdown function solution --- python/README.md | 37 +++++++++++-------------------------- python/README.qmd | 12 ++++++++++-- 2 files changed, 21 insertions(+), 28 deletions(-) diff --git a/python/README.md b/python/README.md index d393386..ea30343 100644 --- a/python/README.md +++ b/python/README.md @@ -1,6 +1,4 @@ ---- -toc-title: Table of contents ---- + # mall @@ -8,8 +6,7 @@ toc-title: Table of contents pip install "mall @ git+https://git@github.com/edgararuiz/mall.git@python#subdirectory=python" ``` -:::: {.cell execution_count="1"} -``` {.python .cell-code} +``` python import mall import polars as pl @@ -21,27 +18,15 @@ reviews = pl.DataFrame( ], schema=[("review", pl.String)], ) +``` -reviews.llm.sentiment("review") +``` python +def markdown(x): + return x.to_pandas().to_markdown() +``` + +``` python +markdown(reviews.llm.sentiment("review")) ``` -::: {.cell-output .cell-output-display execution_count="3"} -
-shape: (3, 2) - - review sentiment - -------------------------------------- -------------- - str str - \"This has been the best TV I\'ve... \"positive\" - \"I regret buying this laptop. I... \"negative\" - \"Not sure how to feel about my ... \"neutral\" - -
-::: -:::: + "| | review | sentiment |\n|---:|:-----------------------------------------------------------------------------------|:------------|\n| 0 | This has been the best TV I've ever used. Great screen, and sound. | positive |\n| 1 | I regret buying this laptop. It is too slow and the keyboard is too noisy | negative |\n| 2 | Not sure how to feel about my new washing machine. Great color, but hard to figure | neutral |" diff --git a/python/README.qmd b/python/README.qmd index 0b15ddb..d713060 100644 --- a/python/README.qmd +++ b/python/README.qmd @@ -1,5 +1,5 @@ --- -format: markdown +format: gfm --- # mall @@ -22,6 +22,14 @@ reviews = pl.DataFrame( ], schema=[("review", pl.String)], ) +``` + +```{python} +def markdown(x): + return x.to_pandas().to_markdown() +``` + -reviews.llm.sentiment("review") +```{python} +markdown(reviews.llm.sentiment("review")) ``` From b863e48e0cb16aeb3ccded89c5a83b331e680d64 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Thu, 3 Oct 2024 16:33:48 -0500 Subject: [PATCH 18/57] Adds summarise --- python/mall/llm.py | 5 +++++ python/mall/polars.py | 22 ++++++++++++++++++++-- python/mall/prompt.py | 15 +++++++++++++++ 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/python/mall/llm.py b/python/mall/llm.py index d042bb2..4b3a6ee 100644 --- a/python/mall/llm.py +++ b/python/mall/llm.py @@ -10,6 +10,11 @@ def build_msg(x, msg): def llm_call(x, msg, use): if use.get("backend"): + # print(dict( + # model=use.get("model"), + # messages=build_msg(x, msg), + # options=use.get("options"), + # )) resp = ollama.chat( model=use.get("model"), messages=build_msg(x, msg), diff --git a/python/mall/polars.py b/python/mall/polars.py index 148294c..cab0a8d 100644 --- a/python/mall/polars.py +++ b/python/mall/polars.py @@ -1,5 +1,5 @@ import polars as pl -from mall.prompt import sentiment +from mall.prompt import sentiment, summarize from mall.llm import llm_call @pl.api.register_dataframe_namespace("llm") @@ -20,8 +20,8 @@ def use(self, 
backend="", model="", **kwargs): def sentiment( self, col, - additional="", options=["positive", "negative", "neutral"], + additional="", pred_name="sentiment", ) -> list[pl.DataFrame]: msg = sentiment(options, additional=additional) @@ -34,3 +34,21 @@ def sentiment( .alias(pred_name) ) return self._df + + def summarize( + self, + col, + max_words=10, + additional="", + pred_name="summary", + ) -> list[pl.DataFrame]: + msg = summarize(max_words, additional=additional) + self._df = self._df.with_columns( + pl.col(col) + .map_elements( + lambda x: llm_call(x, msg, self._use), + return_dtype=pl.String, + ) + .alias(pred_name) + ) + return self._df diff --git a/python/mall/prompt.py b/python/mall/prompt.py index 7a411e3..4cee522 100644 --- a/python/mall/prompt.py +++ b/python/mall/prompt.py @@ -8,6 +8,7 @@ def process_labels(x, if_list="", if_dict=""): out = if_list.replace("{values}", out) return out + def sentiment(options, additional=""): new_options = process_labels( options, @@ -25,3 +26,17 @@ def sentiment(options, additional=""): } ] return msg + + +def summarize(max_words, additional=""): + msg = [ + { + "role": "user", + "content": "You are a helpful summarization engine." + + "Your answer will contain no no capitalization and no explanations." + + f"Return no more than " + str(max_words) + " words." 
+ + f"{additional}" + + "The answer is the summary of the following text:\n{}", + } + ] + return msg From c35d2214a9eca4bcfe22898fd94245e50608f9be Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Thu, 3 Oct 2024 16:36:32 -0500 Subject: [PATCH 19/57] Updates README --- python/README.md | 41 +++++++++++++++++++++++++++++++++++++---- python/README.qmd | 8 +++----- 2 files changed, 40 insertions(+), 9 deletions(-) diff --git a/python/README.md b/python/README.md index ea30343..3b8e897 100644 --- a/python/README.md +++ b/python/README.md @@ -21,12 +21,45 @@ reviews = pl.DataFrame( ``` ``` python -def markdown(x): - return x.to_pandas().to_markdown() +reviews.llm.sentiment("review") ``` +
+shape: (3, 2) + +| review | sentiment | +|----------------------------------|------------| +| str | str | +| "This has been the best TV I've… | "positive" | +| "I regret buying this laptop. I… | "negative" | +| "Not sure how to feel about my … | "negative" | + +
+ ``` python -markdown(reviews.llm.sentiment("review")) +reviews.llm.summarize("review", 5) ``` - "| | review | sentiment |\n|---:|:-----------------------------------------------------------------------------------|:------------|\n| 0 | This has been the best TV I've ever used. Great screen, and sound. | positive |\n| 1 | I regret buying this laptop. It is too slow and the keyboard is too noisy | negative |\n| 2 | Not sure how to feel about my new washing machine. Great color, but hard to figure | neutral |" +
+shape: (3, 3) + +| review | sentiment | summary | +|----|----|----| +| str | str | str | +| "This has been the best TV I've… | "positive" | "best tv experience ever" | +| "I regret buying this laptop. I… | "negative" | "laptop purchase was a mistake" | +| "Not sure how to feel about my … | "negative" | "neutral about the washing mach… | + +
diff --git a/python/README.qmd b/python/README.qmd index d713060..4fb5db3 100644 --- a/python/README.qmd +++ b/python/README.qmd @@ -25,11 +25,9 @@ reviews = pl.DataFrame( ``` ```{python} -def markdown(x): - return x.to_pandas().to_markdown() +reviews.llm.sentiment("review") ``` - ```{python} -markdown(reviews.llm.sentiment("review")) -``` +reviews.llm.summarize("review", 5) +``` \ No newline at end of file From c0672ba176fa533632de376e6ccee64d015307da Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Thu, 3 Oct 2024 16:45:43 -0500 Subject: [PATCH 20/57] Updates readme --- python/README.md | 58 ++++++++++++++++++++++++++++++++++++++++--- python/README.qmd | 14 +++++++++++ python/mall/polars.py | 3 +-- 3 files changed, 69 insertions(+), 6 deletions(-) diff --git a/python/README.md b/python/README.md index 3b8e897..811c94d 100644 --- a/python/README.md +++ b/python/README.md @@ -38,7 +38,7 @@ reviews.llm.sentiment("review") | str | str | | "This has been the best TV I've… | "positive" | | "I regret buying this laptop. I… | "negative" | -| "Not sure how to feel about my … | "negative" | +| "Not sure how to feel about my … | "neutral" | @@ -58,8 +58,58 @@ reviews.llm.summarize("review", 5) | review | sentiment | summary | |----|----|----| | str | str | str | -| "This has been the best TV I've… | "positive" | "best tv experience ever" | -| "I regret buying this laptop. I… | "negative" | "laptop purchase was a mistake" | -| "Not sure how to feel about my … | "negative" | "neutral about the washing mach… | +| "This has been the best TV I've… | "positive" | "great tv with excellent featur… | +| "I regret buying this laptop. I… | "negative" | "bad purchase decision made her… | +| "Not sure how to feel about my … | "neutral" | " unsure about my new washer" | + + + +``` python +reviews.llm.use(options = dict(seed = 100)) +``` + + {'backend': 'ollama', 'model': 'llama3.2', 'options': {'seed': 100}} + +``` python +reviews.llm.summarize("review", 5) +``` + +
+shape: (3, 3) + +| review | sentiment | summary | +|----|----|----| +| str | str | str | +| "This has been the best TV I've… | "positive" | "it's a great tv" | +| "I regret buying this laptop. I… | "negative" | "laptop not worth the money" | +| "Not sure how to feel about my … | "neutral" | "feeling uncertain about new pu… | + +
+ +``` python +reviews.llm.summarize("review", 5) +``` + +
+shape: (3, 3) + +| review | sentiment | summary | +|----|----|----| +| str | str | str | +| "This has been the best TV I've… | "positive" | "it's a great tv" | +| "I regret buying this laptop. I… | "negative" | "laptop not worth the money" | +| "Not sure how to feel about my … | "neutral" | "feeling uncertain about new pu… |
diff --git a/python/README.qmd b/python/README.qmd index 4fb5db3..d30cfa4 100644 --- a/python/README.qmd +++ b/python/README.qmd @@ -24,10 +24,24 @@ reviews = pl.DataFrame( ) ``` + ```{python} reviews.llm.sentiment("review") ``` + +```{python} +reviews.llm.summarize("review", 5) +``` + +```{python} +reviews.llm.use(options = dict(seed = 100)) +``` + +```{python} +reviews.llm.summarize("review", 5) +``` + ```{python} reviews.llm.summarize("review", 5) ``` \ No newline at end of file diff --git a/python/mall/polars.py b/python/mall/polars.py index cab0a8d..1dbd82e 100644 --- a/python/mall/polars.py +++ b/python/mall/polars.py @@ -14,8 +14,7 @@ def use(self, backend="", model="", **kwargs): if model != "": self._use = {"backend": self._use["backend"], "model": model} self._use.update(dict(kwargs)) - print(self._use) - return self._df + return self._use def sentiment( self, From 78dd1f26668dd92271d4f4a56ddb7b51a152a420 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Fri, 4 Oct 2024 08:44:32 -0500 Subject: [PATCH 21/57] Starts translate --- python/mall/polars.py | 25 ++++++++++++++++++++++--- python/mall/prompt.py | 17 ++++++++++++++++- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/python/mall/polars.py b/python/mall/polars.py index 1dbd82e..0a85fd1 100644 --- a/python/mall/polars.py +++ b/python/mall/polars.py @@ -1,6 +1,7 @@ import polars as pl -from mall.prompt import sentiment, summarize -from mall.llm import llm_call +from mall.prompt import sentiment, summarize, translate +from mall.llm import llm_call + @pl.api.register_dataframe_namespace("llm") class MallFrame: @@ -50,4 +51,22 @@ def summarize( ) .alias(pred_name) ) - return self._df + return self._df + + def translate( + self, + col, + language="", + additional="", + pred_name="translation", + ) -> list[pl.DataFrame]: + msg = translate(language, additional=additional) + self._df = self._df.with_columns( + pl.col(col) + .map_elements( + lambda x: llm_call(x, msg, self._use), + 
return_dtype=pl.String, + ) + .alias(pred_name) + ) + return self._df diff --git a/python/mall/prompt.py b/python/mall/prompt.py index 4cee522..70e7f14 100644 --- a/python/mall/prompt.py +++ b/python/mall/prompt.py @@ -34,9 +34,24 @@ def summarize(max_words, additional=""): "role": "user", "content": "You are a helpful summarization engine." + "Your answer will contain no no capitalization and no explanations." - + f"Return no more than " + str(max_words) + " words." + + f"Return no more than " + + str(max_words) + + " words." + f"{additional}" + "The answer is the summary of the following text:\n{}", } ] return msg + + +def translate(language, additional=""): + msg = [ + { + "You are a helpful translation engine." + + "You will return only the translation text, no explanations." + + f"The target language to translate to is: {language}." + + f"{additional}" + + "The answer is the translation of the following text:\n{}" + } + ] + return msg From ab21f78e1fdc1d644993c98a6e10468cc6ba2fc7 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Fri, 4 Oct 2024 10:40:36 -0500 Subject: [PATCH 22/57] Updates README with new print override --- python/README.md | 42 +++--------------------------------------- python/README.qmd | 21 +++++++++++++++++++++ python/mall/llm.py | 15 ++++++++------- python/uv.lock | 2 +- 4 files changed, 33 insertions(+), 47 deletions(-) diff --git a/python/README.md b/python/README.md index 811c94d..073dd2d 100644 --- a/python/README.md +++ b/python/README.md @@ -24,13 +24,6 @@ reviews = pl.DataFrame( reviews.llm.sentiment("review") ``` -
shape: (3, 2) | review | sentiment | @@ -40,29 +33,18 @@ reviews.llm.sentiment("review") | "I regret buying this laptop. I… | "negative" | | "Not sure how to feel about my … | "neutral" | -
- ``` python reviews.llm.summarize("review", 5) ``` -
shape: (3, 3) | review | sentiment | summary | |----|----|----| | str | str | str | -| "This has been the best TV I've… | "positive" | "great tv with excellent featur… | -| "I regret buying this laptop. I… | "negative" | "bad purchase decision made her… | -| "Not sure how to feel about my … | "neutral" | " unsure about my new washer" | - -
+| "This has been the best TV I've… | "positive" | "very happy with this tv" | +| "I regret buying this laptop. I… | "negative" | "laptop not meeting expectation… | +| "Not sure how to feel about my … | "neutral" | "unsure about my new washer" | ``` python reviews.llm.use(options = dict(seed = 100)) @@ -74,13 +56,6 @@ reviews.llm.use(options = dict(seed = 100)) reviews.llm.summarize("review", 5) ``` -
shape: (3, 3) | review | sentiment | summary | @@ -90,19 +65,10 @@ reviews.llm.summarize("review", 5) | "I regret buying this laptop. I… | "negative" | "laptop not worth the money" | | "Not sure how to feel about my … | "neutral" | "feeling uncertain about new pu… | -
- ``` python reviews.llm.summarize("review", 5) ``` -
shape: (3, 3) | review | sentiment | summary | @@ -111,5 +77,3 @@ reviews.llm.summarize("review", 5) | "This has been the best TV I've… | "positive" | "it's a great tv" | | "I regret buying this laptop. I… | "negative" | "laptop not worth the money" | | "Not sure how to feel about my … | "neutral" | "feeling uncertain about new pu… | - -
diff --git a/python/README.qmd b/python/README.qmd index d30cfa4..ad891f2 100644 --- a/python/README.qmd +++ b/python/README.qmd @@ -10,6 +10,27 @@ format: gfm pip install "mall @ git+https://git@github.com/edgararuiz/mall.git@python#subdirectory=python" ``` +```{python} +#| include: false + +import polars as pl +from polars.dataframe._html import HTMLFormatter + + +# demo of formatting w/o style tags ---- + +HTMLFormatter(pl.DataFrame({"x": [1]})).render() + + +# override default jupyter repr html with new formatter ---- + +html_formatter = get_ipython().display_formatter.formatters['text/html'] +html_formatter.for_type(pl.DataFrame, lambda df: "\n".join(HTMLFormatter(df).render())) + +# now html repr automatically uses the lambda above +pl.DataFrame({"x": [1]}) +``` + ```{python} import mall import polars as pl diff --git a/python/mall/llm.py b/python/mall/llm.py index 4b3a6ee..21ac84d 100644 --- a/python/mall/llm.py +++ b/python/mall/llm.py @@ -8,13 +8,14 @@ def build_msg(x, msg): return out -def llm_call(x, msg, use): - if use.get("backend"): - # print(dict( - # model=use.get("model"), - # messages=build_msg(x, msg), - # options=use.get("options"), - # )) +def llm_call(x, msg, use, preview = False): + if use.get("backend") == "ollama": + if preview: + print(dict( + model=use.get("model"), + messages=build_msg(x, msg), + options=use.get("options"), + )) resp = ollama.chat( model=use.get("model"), messages=build_msg(x, msg), diff --git a/python/uv.lock b/python/uv.lock index e752db4..b4ad052 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -82,7 +82,7 @@ dependencies = [ [package.metadata] requires-dist = [ { name = "ollama", specifier = ">=0.3.3" }, - { name = "polars" }, + { name = "polars", specifier = ">=1.9.0" }, ] [[package]] From ea70715337ed086f0127014b759471fc3282dd01 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Fri, 4 Oct 2024 11:15:02 -0500 Subject: [PATCH 23/57] Fixes translation, updates README --- python/README.md | 25 
+++++++++++++++++++------ python/README.qmd | 4 ++++ python/mall/llm.py | 28 ++++++++++++++-------------- python/mall/prompt.py | 5 +++-- 4 files changed, 40 insertions(+), 22 deletions(-) diff --git a/python/README.md b/python/README.md index 073dd2d..a0f87b5 100644 --- a/python/README.md +++ b/python/README.md @@ -31,7 +31,7 @@ reviews.llm.sentiment("review") | str | str | | "This has been the best TV I've… | "positive" | | "I regret buying this laptop. I… | "negative" | -| "Not sure how to feel about my … | "neutral" | +| "Not sure how to feel about my … | "negative" | ``` python reviews.llm.summarize("review", 5) @@ -42,9 +42,9 @@ reviews.llm.summarize("review", 5) | review | sentiment | summary | |----|----|----| | str | str | str | -| "This has been the best TV I've… | "positive" | "very happy with this tv" | -| "I regret buying this laptop. I… | "negative" | "laptop not meeting expectation… | -| "Not sure how to feel about my … | "neutral" | "unsure about my new washer" | +| "This has been the best TV I've… | "positive" | "best tv ever purchased" | +| "I regret buying this laptop. I… | "negative" | "laptop not living up expectati… | +| "Not sure how to feel about my … | "negative" | "having mixed feelings about it" | ``` python reviews.llm.use(options = dict(seed = 100)) @@ -63,7 +63,7 @@ reviews.llm.summarize("review", 5) | str | str | str | | "This has been the best TV I've… | "positive" | "it's a great tv" | | "I regret buying this laptop. I… | "negative" | "laptop not worth the money" | -| "Not sure how to feel about my … | "neutral" | "feeling uncertain about new pu… | +| "Not sure how to feel about my … | "negative" | "feeling uncertain about new pu… | ``` python reviews.llm.summarize("review", 5) @@ -76,4 +76,17 @@ reviews.llm.summarize("review", 5) | str | str | str | | "This has been the best TV I've… | "positive" | "it's a great tv" | | "I regret buying this laptop. 
I… | "negative" | "laptop not worth the money" | -| "Not sure how to feel about my … | "neutral" | "feeling uncertain about new pu… | +| "Not sure how to feel about my … | "negative" | "feeling uncertain about new pu… | + +``` python +reviews.llm.translate("review", "spanish") +``` + +shape: (3, 4) + +| review | sentiment | summary | translation | +|----|----|----|----| +| str | str | str | str | +| "This has been the best TV I've… | "positive" | "it's a great tv" | "Esta ha sido la mejor TV que h… | +| "I regret buying this laptop. I… | "negative" | "laptop not worth the money" | "Lo lamento comprar este portát… | +| "Not sure how to feel about my … | "negative" | "feeling uncertain about new pu… | "No estoy seguro de cómo sentir… | diff --git a/python/README.qmd b/python/README.qmd index ad891f2..dd3f9e5 100644 --- a/python/README.qmd +++ b/python/README.qmd @@ -65,4 +65,8 @@ reviews.llm.summarize("review", 5) ```{python} reviews.llm.summarize("review", 5) +``` + +```{python} +reviews.llm.translate("review", "spanish") ``` \ No newline at end of file diff --git a/python/mall/llm.py b/python/mall/llm.py index 21ac84d..a84d23c 100644 --- a/python/mall/llm.py +++ b/python/mall/llm.py @@ -8,18 +8,18 @@ def build_msg(x, msg): return out -def llm_call(x, msg, use, preview = False): - if use.get("backend") == "ollama": - if preview: - print(dict( - model=use.get("model"), - messages=build_msg(x, msg), - options=use.get("options"), - )) - resp = ollama.chat( - model=use.get("model"), - messages=build_msg(x, msg), - options=use.get("options"), - ) - out = resp["message"]["content"] +def llm_call(x, msg, use, preview=True): + # print( + # dict( + # model=use.get("model"), + # messages=build_msg(x, msg), + # options=use.get("options"), + # ) + # ) + resp = ollama.chat( + model=use.get("model"), + messages=build_msg(x, msg), + options=use.get("options"), + ) + out = resp["message"]["content"] return out diff --git a/python/mall/prompt.py b/python/mall/prompt.py index 
70e7f14..f98e62f 100644 --- a/python/mall/prompt.py +++ b/python/mall/prompt.py @@ -47,11 +47,12 @@ def summarize(max_words, additional=""): def translate(language, additional=""): msg = [ { - "You are a helpful translation engine." + "role": "user", + "content": "You are a helpful translation engine." + "You will return only the translation text, no explanations." + f"The target language to translate to is: {language}." + f"{additional}" - + "The answer is the translation of the following text:\n{}" + + "The answer is the translation of the following text:\n{}", } ] return msg From 120e0b2f2988ab6ecf649c13ed5bd9d07623e118 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Fri, 4 Oct 2024 11:54:36 -0500 Subject: [PATCH 24/57] Centralizes mapping call --- python/README.md | 12 +++++----- python/mall/polars.py | 53 +++++++++++++++++++++++-------------------- 2 files changed, 35 insertions(+), 30 deletions(-) diff --git a/python/README.md b/python/README.md index a0f87b5..e3c632a 100644 --- a/python/README.md +++ b/python/README.md @@ -31,7 +31,7 @@ reviews.llm.sentiment("review") | str | str | | "This has been the best TV I've… | "positive" | | "I regret buying this laptop. I… | "negative" | -| "Not sure how to feel about my … | "negative" | +| "Not sure how to feel about my … | "neutral" | ``` python reviews.llm.summarize("review", 5) @@ -42,9 +42,9 @@ reviews.llm.summarize("review", 5) | review | sentiment | summary | |----|----|----| | str | str | str | -| "This has been the best TV I've… | "positive" | "best tv ever purchased" | +| "This has been the best TV I've… | "positive" | "best tv i've ever owned" | | "I regret buying this laptop. 
I… | "negative" | "laptop not living up expectati… | -| "Not sure how to feel about my … | "negative" | "having mixed feelings about it" | +| "Not sure how to feel about my … | "neutral" | " unsure about the purchase" | ``` python reviews.llm.use(options = dict(seed = 100)) @@ -63,7 +63,7 @@ reviews.llm.summarize("review", 5) | str | str | str | | "This has been the best TV I've… | "positive" | "it's a great tv" | | "I regret buying this laptop. I… | "negative" | "laptop not worth the money" | -| "Not sure how to feel about my … | "negative" | "feeling uncertain about new pu… | +| "Not sure how to feel about my … | "neutral" | "feeling uncertain about new pu… | ``` python reviews.llm.summarize("review", 5) @@ -76,7 +76,7 @@ reviews.llm.summarize("review", 5) | str | str | str | | "This has been the best TV I've… | "positive" | "it's a great tv" | | "I regret buying this laptop. I… | "negative" | "laptop not worth the money" | -| "Not sure how to feel about my … | "negative" | "feeling uncertain about new pu… | +| "Not sure how to feel about my … | "neutral" | "feeling uncertain about new pu… | ``` python reviews.llm.translate("review", "spanish") @@ -89,4 +89,4 @@ reviews.llm.translate("review", "spanish") | str | str | str | str | | "This has been the best TV I've… | "positive" | "it's a great tv" | "Esta ha sido la mejor TV que h… | | "I regret buying this laptop. 
I… | "negative" | "laptop not worth the money" | "Lo lamento comprar este portát… | -| "Not sure how to feel about my … | "negative" | "feeling uncertain about new pu… | "No estoy seguro de cómo sentir… | +| "Not sure how to feel about my … | "neutral" | "feeling uncertain about new pu… | "No estoy seguro de cómo sentir… | diff --git a/python/mall/polars.py b/python/mall/polars.py index 0a85fd1..f3816ab 100644 --- a/python/mall/polars.py +++ b/python/mall/polars.py @@ -24,15 +24,13 @@ def sentiment( additional="", pred_name="sentiment", ) -> list[pl.DataFrame]: - msg = sentiment(options, additional=additional) - self._df = self._df.with_columns( - pl.col(col) - .map_elements( - lambda x: llm_call(x, msg, self._use), - return_dtype=pl.String, + self._df = map_call( + df = self._df, + col= col, + msg=sentiment(options, additional=additional), + pred_name= pred_name, + use=self._use ) - .alias(pred_name) - ) return self._df def summarize( @@ -42,15 +40,13 @@ def summarize( additional="", pred_name="summary", ) -> list[pl.DataFrame]: - msg = summarize(max_words, additional=additional) - self._df = self._df.with_columns( - pl.col(col) - .map_elements( - lambda x: llm_call(x, msg, self._use), - return_dtype=pl.String, + self._df = map_call( + df = self._df, + col= col, + msg=summarize(max_words, additional=additional), + pred_name= pred_name, + use=self._use ) - .alias(pred_name) - ) return self._df def translate( @@ -60,13 +56,22 @@ def translate( additional="", pred_name="translation", ) -> list[pl.DataFrame]: - msg = translate(language, additional=additional) - self._df = self._df.with_columns( - pl.col(col) - .map_elements( - lambda x: llm_call(x, msg, self._use), - return_dtype=pl.String, + self._df = map_call( + df = self._df, + col= col, + msg=translate(language, additional=additional), + pred_name= pred_name, + use=self._use ) - .alias(pred_name) - ) return self._df + +def map_call(df, col, msg, pred_name, use): + df = df.with_columns( + pl.col(col) + 
.map_elements( + lambda x: llm_call(x, msg, use), + return_dtype=pl.String, + ) + .alias(pred_name) + ) + return df From 5260c6c7cb06d2942d993857fc7c7228a52e81f1 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Fri, 4 Oct 2024 12:11:12 -0500 Subject: [PATCH 25/57] Add classify, updates README --- python/README.md | 32 +++++++++++++++++++++++++++++--- python/README.qmd | 15 ++++++++++++++- python/mall/polars.py | 18 +++++++++++++++++- python/mall/prompt.py | 19 +++++++++++++++++++ 4 files changed, 79 insertions(+), 5 deletions(-) diff --git a/python/README.md b/python/README.md index e3c632a..4456a75 100644 --- a/python/README.md +++ b/python/README.md @@ -42,9 +42,9 @@ reviews.llm.summarize("review", 5) | review | sentiment | summary | |----|----|----| | str | str | str | -| "This has been the best TV I've… | "positive" | "best tv i've ever owned" | -| "I regret buying this laptop. I… | "negative" | "laptop not living up expectati… | -| "Not sure how to feel about my … | "neutral" | " unsure about the purchase" | +| "This has been the best TV I've… | "positive" | "it's a great tv" | +| "I regret buying this laptop. I… | "negative" | "laptop not worth the money" | +| "Not sure how to feel about my … | "neutral" | "feeling uncertain about new pu… | ``` python reviews.llm.use(options = dict(seed = 100)) @@ -90,3 +90,29 @@ reviews.llm.translate("review", "spanish") | "This has been the best TV I've… | "positive" | "it's a great tv" | "Esta ha sido la mejor TV que h… | | "I regret buying this laptop. 
I… | "negative" | "laptop not worth the money" | "Lo lamento comprar este portát… | | "Not sure how to feel about my … | "neutral" | "feeling uncertain about new pu… | "No estoy seguro de cómo sentir… | + +``` python +reviews.llm.translate("review", "spanish") +``` + +shape: (3, 4) + +| review | sentiment | summary | translation | +|----|----|----|----| +| str | str | str | str | +| "This has been the best TV I've… | "positive" | "it's a great tv" | "Esta ha sido la mejor TV que h… | +| "I regret buying this laptop. I… | "negative" | "laptop not worth the money" | "Lo lamento comprar este portát… | +| "Not sure how to feel about my … | "neutral" | "feeling uncertain about new pu… | "No estoy seguro de cómo sentir… | + +``` python +reviews.llm.classify("review", ["computer", "appliance"]) +``` + +shape: (3, 5) + +| review | sentiment | summary | translation | classify | +|----|----|----|----|----| +| str | str | str | str | str | +| "This has been the best TV I've… | "positive" | "it's a great tv" | "Esta ha sido la mejor TV que h… | "appliance" | +| "I regret buying this laptop. 
I… | "negative" | "laptop not worth the money" | "Lo lamento comprar este portát… | "appliance" | +| "Not sure how to feel about my … | "neutral" | "feeling uncertain about new pu… | "No estoy seguro de cómo sentir… | "appliance" | diff --git a/python/README.qmd b/python/README.qmd index dd3f9e5..c3dd383 100644 --- a/python/README.qmd +++ b/python/README.qmd @@ -31,6 +31,7 @@ html_formatter.for_type(pl.DataFrame, lambda df: "\n".join(HTMLFormatter(df).ren pl.DataFrame({"x": [1]}) ``` + ```{python} import mall import polars as pl @@ -45,6 +46,10 @@ reviews = pl.DataFrame( ) ``` +```{python} +#| include: false +reviews.llm.use(options = dict(seed = 100)) +``` ```{python} reviews.llm.sentiment("review") @@ -69,4 +74,12 @@ reviews.llm.summarize("review", 5) ```{python} reviews.llm.translate("review", "spanish") -``` \ No newline at end of file +``` + +```{python} +reviews.llm.translate("review", "spanish") +``` + +```{python} +reviews.llm.classify("review", ["computer", "appliance"]) +``` diff --git a/python/mall/polars.py b/python/mall/polars.py index f3816ab..f6054f9 100644 --- a/python/mall/polars.py +++ b/python/mall/polars.py @@ -1,5 +1,5 @@ import polars as pl -from mall.prompt import sentiment, summarize, translate +from mall.prompt import sentiment, summarize, translate, classify from mall.llm import llm_call @@ -65,6 +65,22 @@ def translate( ) return self._df + def classify( + self, + col, + labels="", + additional="", + pred_name="classify", + ) -> list[pl.DataFrame]: + self._df = map_call( + df = self._df, + col= col, + msg=classify(labels, additional=additional), + pred_name= pred_name, + use=self._use + ) + return self._df + def map_call(df, col, msg, pred_name, use): df = df.with_columns( pl.col(col) diff --git a/python/mall/prompt.py b/python/mall/prompt.py index f98e62f..afa987b 100644 --- a/python/mall/prompt.py +++ b/python/mall/prompt.py @@ -56,3 +56,22 @@ def translate(language, additional=""): } ] return msg + + +def classify(labels, 
additional=""): + labels = process_labels( + labels, + "Determine if the text refers to one of the following:{values}", + "- If the text is {key}, return {value}", + ) + msg = [ + { + "role": "user", + "content": "You are a helpful classification engine. " + + f"{labels}. " + + "No capitalization. No explanations. " + + f"{additional} " + + "The answer is based on the following text:\n{}", + } + ] + return msg From a73d03bf7855b013ec8dc257c6ba8d5da4f20855 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Fri, 4 Oct 2024 12:19:21 -0500 Subject: [PATCH 26/57] Switches to not modifying the source df --- python/README.md | 84 +++++++++++++++++++++---------------------- python/mall/polars.py | 57 ++++++++++++++--------------- 2 files changed, 71 insertions(+), 70 deletions(-) diff --git a/python/README.md b/python/README.md index 4456a75..af27335 100644 --- a/python/README.md +++ b/python/README.md @@ -37,14 +37,14 @@ reviews.llm.sentiment("review") reviews.llm.summarize("review", 5) ``` -shape: (3, 3) +shape: (3, 2) -| review | sentiment | summary | -|----|----|----| -| str | str | str | -| "This has been the best TV I've… | "positive" | "it's a great tv" | -| "I regret buying this laptop. I… | "negative" | "laptop not worth the money" | -| "Not sure how to feel about my … | "neutral" | "feeling uncertain about new pu… | +| review | summary | +|----------------------------------|----------------------------------| +| str | str | +| "This has been the best TV I've… | "it's a great tv" | +| "I regret buying this laptop. 
I… | "laptop not worth the money" | +| "Not sure how to feel about my … | "feeling uncertain about new pu… | ``` python reviews.llm.use(options = dict(seed = 100)) @@ -56,63 +56,63 @@ reviews.llm.use(options = dict(seed = 100)) reviews.llm.summarize("review", 5) ``` -shape: (3, 3) +shape: (3, 2) -| review | sentiment | summary | -|----|----|----| -| str | str | str | -| "This has been the best TV I've… | "positive" | "it's a great tv" | -| "I regret buying this laptop. I… | "negative" | "laptop not worth the money" | -| "Not sure how to feel about my … | "neutral" | "feeling uncertain about new pu… | +| review | summary | +|----------------------------------|----------------------------------| +| str | str | +| "This has been the best TV I've… | "it's a great tv" | +| "I regret buying this laptop. I… | "laptop not worth the money" | +| "Not sure how to feel about my … | "feeling uncertain about new pu… | ``` python reviews.llm.summarize("review", 5) ``` -shape: (3, 3) +shape: (3, 2) -| review | sentiment | summary | -|----|----|----| -| str | str | str | -| "This has been the best TV I've… | "positive" | "it's a great tv" | -| "I regret buying this laptop. I… | "negative" | "laptop not worth the money" | -| "Not sure how to feel about my … | "neutral" | "feeling uncertain about new pu… | +| review | summary | +|----------------------------------|----------------------------------| +| str | str | +| "This has been the best TV I've… | "it's a great tv" | +| "I regret buying this laptop. I… | "laptop not worth the money" | +| "Not sure how to feel about my … | "feeling uncertain about new pu… | ``` python reviews.llm.translate("review", "spanish") ``` -shape: (3, 4) +shape: (3, 2) -| review | sentiment | summary | translation | -|----|----|----|----| -| str | str | str | str | -| "This has been the best TV I've… | "positive" | "it's a great tv" | "Esta ha sido la mejor TV que h… | -| "I regret buying this laptop. 
I… | "negative" | "laptop not worth the money" | "Lo lamento comprar este portát… | -| "Not sure how to feel about my … | "neutral" | "feeling uncertain about new pu… | "No estoy seguro de cómo sentir… | +| review | translation | +|----------------------------------|----------------------------------| +| str | str | +| "This has been the best TV I've… | "Esta ha sido la mejor TV que h… | +| "I regret buying this laptop. I… | "Lo lamento comprar este portát… | +| "Not sure how to feel about my … | "No estoy seguro de cómo sentir… | ``` python reviews.llm.translate("review", "spanish") ``` -shape: (3, 4) +shape: (3, 2) -| review | sentiment | summary | translation | -|----|----|----|----| -| str | str | str | str | -| "This has been the best TV I've… | "positive" | "it's a great tv" | "Esta ha sido la mejor TV que h… | -| "I regret buying this laptop. I… | "negative" | "laptop not worth the money" | "Lo lamento comprar este portát… | -| "Not sure how to feel about my … | "neutral" | "feeling uncertain about new pu… | "No estoy seguro de cómo sentir… | +| review | translation | +|----------------------------------|----------------------------------| +| str | str | +| "This has been the best TV I've… | "Esta ha sido la mejor TV que h… | +| "I regret buying this laptop. I… | "Lo lamento comprar este portát… | +| "Not sure how to feel about my … | "No estoy seguro de cómo sentir… | ``` python reviews.llm.classify("review", ["computer", "appliance"]) ``` -shape: (3, 5) +shape: (3, 2) -| review | sentiment | summary | translation | classify | -|----|----|----|----|----| -| str | str | str | str | str | -| "This has been the best TV I've… | "positive" | "it's a great tv" | "Esta ha sido la mejor TV que h… | "appliance" | -| "I regret buying this laptop. 
I… | "negative" | "laptop not worth the money" | "Lo lamento comprar este portát… | "appliance" | -| "Not sure how to feel about my … | "neutral" | "feeling uncertain about new pu… | "No estoy seguro de cómo sentir… | "appliance" | +| review | classify | +|----------------------------------|-------------| +| str | str | +| "This has been the best TV I've… | "appliance" | +| "I regret buying this laptop. I… | "appliance" | +| "Not sure how to feel about my … | "appliance" | diff --git a/python/mall/polars.py b/python/mall/polars.py index f6054f9..3407aeb 100644 --- a/python/mall/polars.py +++ b/python/mall/polars.py @@ -24,14 +24,14 @@ def sentiment( additional="", pred_name="sentiment", ) -> list[pl.DataFrame]: - self._df = map_call( - df = self._df, - col= col, + df = map_call( + df=self._df, + col=col, msg=sentiment(options, additional=additional), - pred_name= pred_name, - use=self._use - ) - return self._df + pred_name=pred_name, + use=self._use, + ) + return df def summarize( self, @@ -40,14 +40,14 @@ def summarize( additional="", pred_name="summary", ) -> list[pl.DataFrame]: - self._df = map_call( - df = self._df, - col= col, + df = map_call( + df=self._df, + col=col, msg=summarize(max_words, additional=additional), - pred_name= pred_name, - use=self._use - ) - return self._df + pred_name=pred_name, + use=self._use, + ) + return df def translate( self, @@ -56,14 +56,14 @@ def translate( additional="", pred_name="translation", ) -> list[pl.DataFrame]: - self._df = map_call( - df = self._df, - col= col, + df = map_call( + df=self._df, + col=col, msg=translate(language, additional=additional), - pred_name= pred_name, - use=self._use - ) - return self._df + pred_name=pred_name, + use=self._use, + ) + return df def classify( self, @@ -72,14 +72,15 @@ def classify( additional="", pred_name="classify", ) -> list[pl.DataFrame]: - self._df = map_call( - df = self._df, - col= col, + df = map_call( + df=self._df, + col=col, msg=classify(labels, additional=additional), - 
pred_name= pred_name, - use=self._use - ) - return self._df + pred_name=pred_name, + use=self._use, + ) + return df + def map_call(df, col, msg, pred_name, use): df = df.with_columns( From b06597b32f0426c7f97ee2d2f071426416864e64 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Fri, 4 Oct 2024 12:48:52 -0500 Subject: [PATCH 27/57] Adds subtitles to README --- python/README.md | 54 ++++++++++--------------------------------- python/README.qmd | 36 ++++++++--------------------- python/mall/prompt.py | 4 ++-- 3 files changed, 23 insertions(+), 71 deletions(-) diff --git a/python/README.md b/python/README.md index af27335..b35aefa 100644 --- a/python/README.md +++ b/python/README.md @@ -1,3 +1,4 @@ +# Sentiment # mall @@ -33,24 +34,7 @@ reviews.llm.sentiment("review") | "I regret buying this laptop. I… | "negative" | | "Not sure how to feel about my … | "neutral" | -``` python -reviews.llm.summarize("review", 5) -``` - -shape: (3, 2) - -| review | summary | -|----------------------------------|----------------------------------| -| str | str | -| "This has been the best TV I've… | "it's a great tv" | -| "I regret buying this laptop. I… | "laptop not worth the money" | -| "Not sure how to feel about my … | "feeling uncertain about new pu… | - -``` python -reviews.llm.use(options = dict(seed = 100)) -``` - - {'backend': 'ollama', 'model': 'llama3.2', 'options': {'seed': 100}} +## Summarize ``` python reviews.llm.summarize("review", 5) @@ -65,18 +49,7 @@ reviews.llm.summarize("review", 5) | "I regret buying this laptop. I… | "laptop not worth the money" | | "Not sure how to feel about my … | "feeling uncertain about new pu… | -``` python -reviews.llm.summarize("review", 5) -``` - -shape: (3, 2) - -| review | summary | -|----------------------------------|----------------------------------| -| str | str | -| "This has been the best TV I've… | "it's a great tv" | -| "I regret buying this laptop. 
I… | "laptop not worth the money" | -| "Not sure how to feel about my … | "feeling uncertain about new pu… | +## Translate (as in ‘English to French’) ``` python reviews.llm.translate("review", "spanish") @@ -91,18 +64,7 @@ reviews.llm.translate("review", "spanish") | "I regret buying this laptop. I… | "Lo lamento comprar este portát… | | "Not sure how to feel about my … | "No estoy seguro de cómo sentir… | -``` python -reviews.llm.translate("review", "spanish") -``` - -shape: (3, 2) - -| review | translation | -|----------------------------------|----------------------------------| -| str | str | -| "This has been the best TV I've… | "Esta ha sido la mejor TV que h… | -| "I regret buying this laptop. I… | "Lo lamento comprar este portát… | -| "Not sure how to feel about my … | "No estoy seguro de cómo sentir… | +## Classify ``` python reviews.llm.classify("review", ["computer", "appliance"]) @@ -116,3 +78,11 @@ reviews.llm.classify("review", ["computer", "appliance"]) | "This has been the best TV I've… | "appliance" | | "I regret buying this laptop. 
I… | "appliance" | | "Not sure how to feel about my … | "appliance" | + +## LLM session setup + +``` python +reviews.llm.use(options = dict(seed = 100)) +``` + + {'backend': 'ollama', 'model': 'llama3.2', 'options': {'seed': 100}} diff --git a/python/README.qmd b/python/README.qmd index c3dd383..1342449 100644 --- a/python/README.qmd +++ b/python/README.qmd @@ -4,31 +4,16 @@ format: gfm # mall - - ```python pip install "mall @ git+https://git@github.com/edgararuiz/mall.git@python#subdirectory=python" ``` ```{python} #| include: false - import polars as pl from polars.dataframe._html import HTMLFormatter - - -# demo of formatting w/o style tags ---- - -HTMLFormatter(pl.DataFrame({"x": [1]})).render() - - -# override default jupyter repr html with new formatter ---- - html_formatter = get_ipython().display_formatter.formatters['text/html'] html_formatter.for_type(pl.DataFrame, lambda df: "\n".join(HTMLFormatter(df).render())) - -# now html repr automatically uses the lambda above -pl.DataFrame({"x": [1]}) ``` @@ -51,35 +36,32 @@ reviews = pl.DataFrame( reviews.llm.use(options = dict(seed = 100)) ``` +## Sentiment + ```{python} reviews.llm.sentiment("review") ``` +## Summarize ```{python} reviews.llm.summarize("review", 5) ``` -```{python} -reviews.llm.use(options = dict(seed = 100)) -``` +## Translate (as in 'English to French') ```{python} -reviews.llm.summarize("review", 5) +reviews.llm.translate("review", "spanish") ``` -```{python} -reviews.llm.summarize("review", 5) -``` +## Classify ```{python} -reviews.llm.translate("review", "spanish") +reviews.llm.classify("review", ["computer", "appliance"]) ``` -```{python} -reviews.llm.translate("review", "spanish") -``` +## LLM session setup ```{python} -reviews.llm.classify("review", ["computer", "appliance"]) +reviews.llm.use(options = dict(seed = 100)) ``` diff --git a/python/mall/prompt.py b/python/mall/prompt.py index afa987b..878fc0a 100644 --- a/python/mall/prompt.py +++ b/python/mall/prompt.py @@ -59,7 +59,7 @@ 
def translate(language, additional=""): def classify(labels, additional=""): - labels = process_labels( + new_labels = process_labels( labels, "Determine if the text refers to one of the following:{values}", "- If the text is {key}, return {value}", @@ -68,7 +68,7 @@ def classify(labels, additional=""): { "role": "user", "content": "You are a helpful classification engine. " - + f"{labels}. " + + f"{new_labels}. " + "No capitalization. No explanations. " + f"{additional} " + "The answer is based on the following text:\n{}", From 6a4ff3da7932b35a26aeedc0dd29b2fdfdd6b2b1 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Fri, 4 Oct 2024 12:52:02 -0500 Subject: [PATCH 28/57] Updates README --- python/README.md | 16 ++++++++++++++-- python/README.qmd | 11 +++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/python/README.md b/python/README.md index b35aefa..18194a4 100644 --- a/python/README.md +++ b/python/README.md @@ -1,12 +1,21 @@ -# Sentiment +# mall +## Intro -# mall +Run multiple LLM predictions against a data frame. The predictions are +processed row-wise over a specified column. It works using a +pre-determined one-shot prompt, along with the current row’s content. + +## Install + +To install from Github, use: ``` python pip install "mall @ git+https://git@github.com/edgararuiz/mall.git@python#subdirectory=python" ``` +## Examples + ``` python import mall import polars as pl @@ -21,6 +30,9 @@ reviews = pl.DataFrame( ) ``` +## Sentiment + + ``` python reviews.llm.sentiment("review") ``` diff --git a/python/README.qmd b/python/README.qmd index 1342449..af8b3ed 100644 --- a/python/README.qmd +++ b/python/README.qmd @@ -4,10 +4,20 @@ format: gfm # mall +## Intro + +Run multiple LLM predictions against a data frame. The predictions are processed row-wise over a specified column. It works using a pre-determined one-shot prompt, along with the current row’s content. 
+ +## Install + +To install from Github, use: + ```python pip install "mall @ git+https://git@github.com/edgararuiz/mall.git@python#subdirectory=python" ``` +## Examples + ```{python} #| include: false import polars as pl @@ -36,6 +46,7 @@ reviews = pl.DataFrame( reviews.llm.use(options = dict(seed = 100)) ``` + ## Sentiment ```{python} From 1493eb9f98d49f0ef63817d4295f17bd39564e5c Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Fri, 4 Oct 2024 13:27:38 -0500 Subject: [PATCH 29/57] Starts adding some documentation --- python/mall/polars.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/python/mall/polars.py b/python/mall/polars.py index 3407aeb..359c9c6 100644 --- a/python/mall/polars.py +++ b/python/mall/polars.py @@ -5,6 +5,9 @@ @pl.api.register_dataframe_namespace("llm") class MallFrame: + """Extension to Polars that add ability to use + an LLM to run batch predictions over a data frame + """ def __init__(self, df: pl.DataFrame) -> None: self._df = df self._use = {"backend": "ollama", "model": "llama3.2"} @@ -56,6 +59,23 @@ def translate( additional="", pred_name="translation", ) -> list[pl.DataFrame]: + """Translate text into another language. + + Parameters + ------ + col + The name of the text field to process + + language + The target language to translate to. For example 'French'. 
+ + pred_name + A character vector with the name of the new column where the + prediction will be placed + + additional + Inserts this text into the prompt sent to the LLM + """ df = map_call( df=self._df, col=col, From 8d7c3fd12549767962bdd44d4dedae72cf5c951c Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Fri, 4 Oct 2024 14:03:32 -0500 Subject: [PATCH 30/57] Starts moving files around to build unified site --- _quarto.yml | 4 +- {r/articles => articles}/caching.qmd | 0 {r/articles => articles}/databricks.qmd | 0 .../favicon/apple-touch-icon-120x120.png | Bin .../favicon/apple-touch-icon-152x152.png | Bin .../favicon/apple-touch-icon-180x180.png | Bin .../favicon/apple-touch-icon-60x60.png | Bin .../favicon/apple-touch-icon-76x76.png | Bin .../favicon/apple-touch-icon.png | Bin .../favicon/favicon-16x16.png | Bin .../favicon/favicon-32x32.png | Bin .../figures => figures}/favicon/favicon.ico | Bin {r/man/figures => figures}/logo.png | Bin {r/man/figures => figures}/mall.png | Bin index.qmd | 68 ++++++++++++++++++ r/.Rbuildignore | 2 + r/index.qmd | 7 -- {r/reference => reference}/index.qmd | 0 {r/reference => reference}/llm_classify.qmd | 0 {r/reference => reference}/llm_custom.qmd | 0 {r/reference => reference}/llm_extract.qmd | 0 {r/reference => reference}/llm_sentiment.qmd | 0 {r/reference => reference}/llm_summarize.qmd | 0 {r/reference => reference}/llm_translate.qmd | 0 {r/reference => reference}/llm_use.qmd | 0 .../m_backend_submit.qmd | 0 {r/reference => reference}/reviews.qmd | 0 27 files changed, 72 insertions(+), 9 deletions(-) rename {r/articles => articles}/caching.qmd (100%) rename {r/articles => articles}/databricks.qmd (100%) rename {r/man/figures => figures}/favicon/apple-touch-icon-120x120.png (100%) rename {r/man/figures => figures}/favicon/apple-touch-icon-152x152.png (100%) rename {r/man/figures => figures}/favicon/apple-touch-icon-180x180.png (100%) rename {r/man/figures => figures}/favicon/apple-touch-icon-60x60.png (100%) rename 
{r/man/figures => figures}/favicon/apple-touch-icon-76x76.png (100%) rename {r/man/figures => figures}/favicon/apple-touch-icon.png (100%) rename {r/man/figures => figures}/favicon/favicon-16x16.png (100%) rename {r/man/figures => figures}/favicon/favicon-32x32.png (100%) rename {r/man/figures => figures}/favicon/favicon.ico (100%) rename {r/man/figures => figures}/logo.png (100%) rename {r/man/figures => figures}/mall.png (100%) create mode 100644 index.qmd create mode 100644 r/.Rbuildignore delete mode 100644 r/index.qmd rename {r/reference => reference}/index.qmd (100%) rename {r/reference => reference}/llm_classify.qmd (100%) rename {r/reference => reference}/llm_custom.qmd (100%) rename {r/reference => reference}/llm_extract.qmd (100%) rename {r/reference => reference}/llm_sentiment.qmd (100%) rename {r/reference => reference}/llm_summarize.qmd (100%) rename {r/reference => reference}/llm_translate.qmd (100%) rename {r/reference => reference}/llm_use.qmd (100%) rename {r/reference => reference}/m_backend_submit.qmd (100%) rename {r/reference => reference}/reviews.qmd (100%) diff --git a/_quarto.yml b/_quarto.yml index 7ed0039..3390754 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -8,9 +8,9 @@ execute: website: title: mall - favicon: "man/figures/favicon/apple-touch-icon.png" + favicon: "figures/favicon/apple-touch-icon.png" navbar: - logo: "man/figures/favicon/apple-touch-icon.png" + logo: "figures/favicon/apple-touch-icon.png" left: - sidebar:articles - href: reference/index.qmd diff --git a/r/articles/caching.qmd b/articles/caching.qmd similarity index 100% rename from r/articles/caching.qmd rename to articles/caching.qmd diff --git a/r/articles/databricks.qmd b/articles/databricks.qmd similarity index 100% rename from r/articles/databricks.qmd rename to articles/databricks.qmd diff --git a/r/man/figures/favicon/apple-touch-icon-120x120.png b/figures/favicon/apple-touch-icon-120x120.png similarity index 100% rename from 
r/man/figures/favicon/apple-touch-icon-120x120.png rename to figures/favicon/apple-touch-icon-120x120.png diff --git a/r/man/figures/favicon/apple-touch-icon-152x152.png b/figures/favicon/apple-touch-icon-152x152.png similarity index 100% rename from r/man/figures/favicon/apple-touch-icon-152x152.png rename to figures/favicon/apple-touch-icon-152x152.png diff --git a/r/man/figures/favicon/apple-touch-icon-180x180.png b/figures/favicon/apple-touch-icon-180x180.png similarity index 100% rename from r/man/figures/favicon/apple-touch-icon-180x180.png rename to figures/favicon/apple-touch-icon-180x180.png diff --git a/r/man/figures/favicon/apple-touch-icon-60x60.png b/figures/favicon/apple-touch-icon-60x60.png similarity index 100% rename from r/man/figures/favicon/apple-touch-icon-60x60.png rename to figures/favicon/apple-touch-icon-60x60.png diff --git a/r/man/figures/favicon/apple-touch-icon-76x76.png b/figures/favicon/apple-touch-icon-76x76.png similarity index 100% rename from r/man/figures/favicon/apple-touch-icon-76x76.png rename to figures/favicon/apple-touch-icon-76x76.png diff --git a/r/man/figures/favicon/apple-touch-icon.png b/figures/favicon/apple-touch-icon.png similarity index 100% rename from r/man/figures/favicon/apple-touch-icon.png rename to figures/favicon/apple-touch-icon.png diff --git a/r/man/figures/favicon/favicon-16x16.png b/figures/favicon/favicon-16x16.png similarity index 100% rename from r/man/figures/favicon/favicon-16x16.png rename to figures/favicon/favicon-16x16.png diff --git a/r/man/figures/favicon/favicon-32x32.png b/figures/favicon/favicon-32x32.png similarity index 100% rename from r/man/figures/favicon/favicon-32x32.png rename to figures/favicon/favicon-32x32.png diff --git a/r/man/figures/favicon/favicon.ico b/figures/favicon/favicon.ico similarity index 100% rename from r/man/figures/favicon/favicon.ico rename to figures/favicon/favicon.ico diff --git a/r/man/figures/logo.png b/figures/logo.png similarity index 100% rename from 
r/man/figures/logo.png rename to figures/logo.png diff --git a/r/man/figures/mall.png b/figures/mall.png similarity index 100% rename from r/man/figures/mall.png rename to figures/mall.png diff --git a/index.qmd b/index.qmd new file mode 100644 index 0000000..bb8e8aa --- /dev/null +++ b/index.qmd @@ -0,0 +1,68 @@ +--- +format: + html: + toc: true +--- + +::: {.panel-tabset group="language"} +## R + +``` {.r} +fizz_buzz <- function(fbnums = 1:50) { + output <- dplyr::case_when( + fbnums %% 15 == 0 ~ "FizzBuzz", + fbnums %% 3 == 0 ~ "Fizz", + fbnums %% 5 == 0 ~ "Buzz", + TRUE ~ as.character(fbnums) + ) + print(output) +} +``` + +## Python + +``` {.python} +def fizz_buzz(num): + if num % 15 == 0: + print("FizzBuzz") + elif num % 5 == 0: + print("Buzz") + elif num % 3 == 0: + print("Fizz") + else: + print(num) +``` + +::: + + +::: {.panel-tabset group="language"} +## R + +``` {.r} +fizz_buzz <- function(fbnums = 1:50) { + output <- dplyr::case_when( + fbnums %% 15 == 0 ~ "FizzBuzz", + fbnums %% 3 == 0 ~ "Fizz", + fbnums %% 5 == 0 ~ "Buzz", + TRUE ~ as.character(fbnums) + ) + print(output) +} +``` + +## Python + +``` {.python} +def fizz_buzz(num): + if num % 15 == 0: + print("FizzBuzz") + elif num % 5 == 0: + print("Buzz") + elif num % 3 == 0: + print("Fizz") + else: + print(num) +``` + +::: diff --git a/r/.Rbuildignore b/r/.Rbuildignore new file mode 100644 index 0000000..91114bf --- /dev/null +++ b/r/.Rbuildignore @@ -0,0 +1,2 @@ +^.*\.Rproj$ +^\.Rproj\.user$ diff --git a/r/index.qmd b/r/index.qmd deleted file mode 100644 index 336d2c4..0000000 --- a/r/index.qmd +++ /dev/null @@ -1,7 +0,0 @@ ---- -format: - html: - toc: true ---- - -{{< include README.md >}} diff --git a/r/reference/index.qmd b/reference/index.qmd similarity index 100% rename from r/reference/index.qmd rename to reference/index.qmd diff --git a/r/reference/llm_classify.qmd b/reference/llm_classify.qmd similarity index 100% rename from r/reference/llm_classify.qmd rename to reference/llm_classify.qmd 
diff --git a/r/reference/llm_custom.qmd b/reference/llm_custom.qmd similarity index 100% rename from r/reference/llm_custom.qmd rename to reference/llm_custom.qmd diff --git a/r/reference/llm_extract.qmd b/reference/llm_extract.qmd similarity index 100% rename from r/reference/llm_extract.qmd rename to reference/llm_extract.qmd diff --git a/r/reference/llm_sentiment.qmd b/reference/llm_sentiment.qmd similarity index 100% rename from r/reference/llm_sentiment.qmd rename to reference/llm_sentiment.qmd diff --git a/r/reference/llm_summarize.qmd b/reference/llm_summarize.qmd similarity index 100% rename from r/reference/llm_summarize.qmd rename to reference/llm_summarize.qmd diff --git a/r/reference/llm_translate.qmd b/reference/llm_translate.qmd similarity index 100% rename from r/reference/llm_translate.qmd rename to reference/llm_translate.qmd diff --git a/r/reference/llm_use.qmd b/reference/llm_use.qmd similarity index 100% rename from r/reference/llm_use.qmd rename to reference/llm_use.qmd diff --git a/r/reference/m_backend_submit.qmd b/reference/m_backend_submit.qmd similarity index 100% rename from r/reference/m_backend_submit.qmd rename to reference/m_backend_submit.qmd diff --git a/r/reference/reviews.qmd b/reference/reviews.qmd similarity index 100% rename from r/reference/reviews.qmd rename to reference/reviews.qmd From d24b3ae34ec0cc687eb78fa685bd6d1994e73e03 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Fri, 4 Oct 2024 14:48:53 -0500 Subject: [PATCH 31/57] Moves utils folder, starts adding content from original index --- index.qmd | 384 +++++++++++++++++++ {r/utils => utils}/knitr-print.R | 0 {r/utils => utils}/website/README.md | 0 {r/utils => utils}/website/_reference.qmd | 0 {r/utils => utils}/website/build_reference.R | 0 {r/utils => utils}/website/index-page.R | 0 {r/utils => utils}/website/list-to-qmd.R | 0 {r/utils => utils}/website/rd-to-list.R | 0 8 files changed, 384 insertions(+) rename {r/utils => utils}/knitr-print.R (100%) rename {r/utils 
=> utils}/website/README.md (100%) rename {r/utils => utils}/website/_reference.qmd (100%) rename {r/utils => utils}/website/build_reference.R (100%) rename {r/utils => utils}/website/index-page.R (100%) rename {r/utils => utils}/website/list-to-qmd.R (100%) rename {r/utils => utils}/website/rd-to-list.R (100%) diff --git a/index.qmd b/index.qmd index bb8e8aa..b5858a7 100644 --- a/index.qmd +++ b/index.qmd @@ -2,8 +2,392 @@ format: html: toc: true +execute: + eval: false + freeze: false --- + +```{r} +#| eval: true +#| include: false +library(dplyr) +library(dbplyr) +library(tictoc) +library(DBI) +source("utils/knitr-print.R") +mall::llm_use("ollama", "llama3.2", seed = 100, .cache = "_readme_cache") +``` + + + + +## Intro + +Run multiple LLM predictions against a data frame. The predictions are processed +row-wise over a specified column. It works using a pre-determined one-shot prompt, +along with the current row's content. The prompt that is use will depend of the +type of analysis needed. Currently, the included prompts perform the following: + +- [Sentiment analysis](#sentiment) +- [Text summarizing](#summarize) +- [Classify text](#classify) +- [Extract one, or several](#extract), specific pieces information from the text +- [Translate text](#translate) +- [Custom prompt](#custom-prompt) + +This package is inspired by the SQL AI functions now offered by vendors such as +[Databricks](https://docs.databricks.com/en/large-language-models/ai-functions.html) +and Snowflake. `mall` uses [Ollama](https://ollama.com/) to +interact with LLMs installed locally. That interaction takes place via the +[`ollamar`](https://hauselin.github.io/ollama-r/) package. + +## Motivation + +We want to new find ways to help data scientists use LLMs in their daily work. +Unlike the familiar interfaces, such as chatting and code completion, this interface +runs your text data directly against the LLM. 
The LLM's flexibility allows +it to adapt to the subject of your data, and provide surprisingly accurate predictions. +This saves the data scientist the need to write and tune an NLP model. + +```{r} +#| include: false + +# Add paragraph about: thanks to the more widespread availability of capable +# local llms, data does not leave your company, no $$ cost to use + +``` + +## Get started + +- Install `mall` from GitHub + + +::: {.panel-tabset group="language"} +## R +```r +pak::pak("edgararuiz/mall/r@python") +``` + +## python +```python +pip install "mall @ git+https://git@github.com/edgararuiz/mall.git@python#subdirectory=python" +``` +::: + + +### With local LLMs + +- Install Ollama on your machine. The `ollamar` package's website provides this +[Installation guide](https://hauselin.github.io/ollama-r/#installation) + +- Download an LLM model. For example, I have been developing this package using +Llama 3.2 to test. To get that model you can run: + ```r + ollamar::pull("llama3.2") + ``` + +### With Databricks + +If you pass a table connected to **Databricks** via `odbc`, `mall` will +automatically use Databricks' LLM instead of Ollama. *You won't need Ollama +installed if you are using Databricks only.* + +`mall` will call the appropriate SQL AI function. For more information see our +[Databricks article.](https://edgararuiz.github.io/mall/articles/databricks.html) + +## LLM functions + +### Sentiment + +Primarily, `mall` provides verb-like functions that expect a `tbl` as +their first argument. This allows us to use them in piped operations. + +We will start with loading a very small data set contained in `mall`. It has +3 product reviews that we will use as the source of our examples. 
+ +::: {.panel-tabset group="language"} +## R + +```{r} +#| eval: true + +library(mall) + +data("reviews") + +reviews +``` + +## python + +```{python} +#| eval: true + + +import mall +import polars as pl + +reviews = pl.DataFrame( + data=[ + "This has been the best TV I've ever used. Great screen, and sound.", + "I regret buying this laptop. It is too slow and the keyboard is too noisy", + "Not sure how to feel about my new washing machine. Great color, but hard to figure" + ], + schema=[("review", pl.String)], +) +``` +::: + +For the first example, we'll asses the sentiment of each review. In order to +do this we will call `llm_sentiment()`: + +::: {.panel-tabset group="language"} +## R + +```{r} +#| eval: true + +reviews |> + llm_sentiment(review) +``` + +## python + +```{python} +#| eval: true + +reviews.llm.sentiment("review") +``` + +::: + +The function let's us modify the options to choose from: + +::: {.panel-tabset group="language"} +## R + +```{r} +reviews |> + llm_sentiment(review, options = c("positive", "negative")) +``` + +## python + +```{python} +reviews.llm.sentiment("review", options=["positive", "negative"]) +``` + +::: + +As mentioned before, by being pipe friendly, the results from the LLM prediction +can be used in further transformations: + +```{r} +reviews |> + llm_sentiment(review, options = c("positive", "negative")) |> + filter(.sentiment == "negative") +``` + +### Summarize + +There may be a need to reduce the number of words in a given text. Usually, to +make it easier to capture its intent. To do this, use `llm_summarize()`. This +function has an argument to control the maximum number of words to output +(`max_words`): + +```{r} +reviews |> + llm_summarize(review, max_words = 5) +``` + +To control the name of the prediction field, you can change `pred_name` argument. +This works with the other `llm_` functions as well. 
+ +```{r} +reviews |> + llm_summarize(review, max_words = 5, pred_name = "review_summary") +``` + +### Classify + +Use the LLM to categorize the text into one of the options you provide: + +```{r} +reviews |> + llm_classify(review, c("appliance", "computer")) +``` + +### Extract + +One of the most interesting operations. Using natural language, we can tell the +LLM to return a specific part of the text. In the following example, we request +that the LLM return the product being referred to. We do this by simply saying +"product". The LLM understands what we *mean* by that word, and looks for that +in the text. + + +```{r} +reviews |> + llm_extract(review, "product") +``` + +### Translate + +As the title implies, this function will translate the text into a specified +language. What is really nice, it is that you don't need to specify the language +of the source text. Only the target language needs to be defined. The translation +accuracy will depend on the LLM + +```{r} +reviews |> + llm_translate(review, "spanish") +``` + + +### Custom prompt + +It is possible to pass your own prompt to the LLM, and have `mall` run it +against each text entry. Use `llm_custom()` to access this functionality: + +```{r} +my_prompt <- paste( + "Answer a question.", + "Return only the answer, no explanation", + "Acceptable answers are 'yes', 'no'", + "Answer this about the following text, is this a happy customer?:" +) + +reviews |> + llm_custom(review, my_prompt) +``` + +## Initialize session + +Invoking an `llm_` function will automatically initialize a model selection +if you don't have one selected yet. If there is only one option, it will +pre-select it for you. If there is more than one available model, then `mall` +will present you with a menu selection so you can select which model you wish to +use. + +Calling `llm_use()` directly will let you specify the model and backend to use. 
+You can also setup additional arguments that will be passed down to the +function that actually runs the prediction. In the case of Ollama, that function +is [`chat()`](https://hauselin.github.io/ollama-r/reference/chat.html). + +```{r, eval = FALSE} +llm_use("ollama", "llama3.2", seed = 100, temperature = 0) +``` + +## Key considerations + +The main consideration is **cost**. Either, time cost, or money cost. + +If using this method with an LLM locally available, the cost will be a long +running time. Unless using a very specialized LLM, a given LLM is a general model. +It was fitted using a vast amount of data. So determining a response for each +row, takes longer than if using a manually created NLP model. The default model +used in Ollama is [Llama 3.2](https://ollama.com/library/llama3.2), +which was fitted using 3B parameters. + +If using an external LLM service, the consideration will need to be for the +billing costs of using such service. Keep in mind that you will be sending a lot +of data to be evaluated. + +Another consideration is the novelty of this approach. Early tests are +providing encouraging results. But you, as a user, will still need to keep +in mind that the predictions will not be infallible, so always check the output. +At this time, I think the best use for this method, is for a quick analysis. + +## Performance + +We will briefly cover this method's performance from two perspectives: + +- How long the analysis takes to run locally + +- How well it predicts + +To do so, we will use the `data_bookReviews` data set, provided by the `classmap` +package. For this exercise, only the first 100, of the total 1,000, are going +to be part of this analysis. 
+ +```{r} +library(classmap) + +data(data_bookReviews) + +data_bookReviews |> + glimpse() +``` +As per the docs, `sentiment` is a factor indicating the sentiment of the review: +negative (1) or positive (2) + +```{r} +length(strsplit(paste(head(data_bookReviews$review, 100), collapse = " "), " ")[[1]]) +``` + +Just to get an idea of how much data we're processing, I'm using a very, very +simple word count. So we're analyzing a bit over 20 thousand words. + +```{r} +reviews_llm <- data_bookReviews |> + head(100) |> + llm_sentiment( + col = review, + options = c("positive" ~ 2, "negative" ~ 1), + pred_name = "predicted" + ) +``` + +As far as **time**, on my Apple M3 machine, it took about 1.5 minutes to process, +100 rows, containing 20 thousand words. Setting `temperature` to 0 in `llm_use()`, +made the model run faster. + +The package uses `purrr` to send each prompt individually to the LLM. But, I did +try a few different ways to speed up the process, unsuccessfully: + +- Used `furrr` to send multiple requests at a time. This did not work because +either the LLM or Ollama processed all my requests serially. So there was +no improvement. + +- I also tried sending more than one row's text at a time. This caused instability +in the number of results. For example sending 5 at a time, sometimes returned 7 +or 8. Even sending 2 was not stable. + +This is what the new table looks like: + +```{r} +reviews_llm +``` + +I used `yardstick` to see how well the model performed. Of course, the accuracy +will not be of the "truth", but rather the package's results recorded in +`sentiment`. + +```{r} +library(forcats) + +reviews_llm |> + mutate(predicted = as.factor(predicted)) |> + yardstick::accuracy(sentiment, predicted) +``` + +## Vector functions + +`mall` includes functions that expect a vector, instead of a table, to run the +predictions. This should make it easier to test things, such as custom prompts +or results of specific text. 
Each `llm_` function has a corresponding `llm_vec_` +function: + +```{r} +llm_vec_sentiment("I am happy") +``` + +```{r} +llm_vec_translate("Este es el mejor dia!", "english") +``` + + ::: {.panel-tabset group="language"} ## R diff --git a/r/utils/knitr-print.R b/utils/knitr-print.R similarity index 100% rename from r/utils/knitr-print.R rename to utils/knitr-print.R diff --git a/r/utils/website/README.md b/utils/website/README.md similarity index 100% rename from r/utils/website/README.md rename to utils/website/README.md diff --git a/r/utils/website/_reference.qmd b/utils/website/_reference.qmd similarity index 100% rename from r/utils/website/_reference.qmd rename to utils/website/_reference.qmd diff --git a/r/utils/website/build_reference.R b/utils/website/build_reference.R similarity index 100% rename from r/utils/website/build_reference.R rename to utils/website/build_reference.R diff --git a/r/utils/website/index-page.R b/utils/website/index-page.R similarity index 100% rename from r/utils/website/index-page.R rename to utils/website/index-page.R diff --git a/r/utils/website/list-to-qmd.R b/utils/website/list-to-qmd.R similarity index 100% rename from r/utils/website/list-to-qmd.R rename to utils/website/list-to-qmd.R diff --git a/r/utils/website/rd-to-list.R b/utils/website/rd-to-list.R similarity index 100% rename from r/utils/website/rd-to-list.R rename to utils/website/rd-to-list.R From 4d91edba1d00ced1f28472aef73d37b045128ee1 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Fri, 4 Oct 2024 20:47:55 -0500 Subject: [PATCH 32/57] Adds python data set, activates some code --- index.qmd | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/index.qmd b/index.qmd index b5858a7..c721a9e 100644 --- a/index.qmd +++ b/index.qmd @@ -139,6 +139,25 @@ reviews = pl.DataFrame( ``` ::: +```{python} +import mall +import polars as pl + +reviews = pl.DataFrame( + data=[ + "This has been the best TV I've ever used. 
Great screen, and sound.", + "I regret buying this laptop. It is too slow and the keyboard is too noisy", + "Not sure how to feel about my new washing machine. Great color, but hard to figure" + ], + schema=[("review", pl.String)], +) +``` + +```{python} +#| include: false +reviews.llm.use(options = dict(seed = 100)) +``` + For the first example, we'll asses the sentiment of each review. In order to do this we will call `llm_sentiment()`: @@ -168,6 +187,8 @@ The function let's us modify the options to choose from: ## R ```{r} +#| eval: true + reviews |> llm_sentiment(review, options = c("positive", "negative")) ``` @@ -175,6 +196,8 @@ reviews |> ## python ```{python} +#| eval: true + reviews.llm.sentiment("review", options=["positive", "negative"]) ``` @@ -183,12 +206,27 @@ reviews.llm.sentiment("review", options=["positive", "negative"]) As mentioned before, by being pipe friendly, the results from the LLM prediction can be used in further transformations: +::: {.panel-tabset group="language"} +## R + ```{r} +#| eval: true + reviews |> llm_sentiment(review, options = c("positive", "negative")) |> filter(.sentiment == "negative") ``` +## python + +```{python} + +reviews.llm.sentiment("review", options=["positive", "negative"]).filter(sentiment == "negative") + +``` + +::: + ### Summarize There may be a need to reduce the number of words in a given text. Usually, to From 1d70ae59eb5c9ab9f5d810b38f38f0be1eb20edc Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Sat, 5 Oct 2024 18:06:05 -0500 Subject: [PATCH 33/57] Finishes sentiment code --- index.qmd | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/index.qmd b/index.qmd index c721a9e..8f9c314 100644 --- a/index.qmd +++ b/index.qmd @@ -22,8 +22,6 @@ mall::llm_use("ollama", "llama3.2", seed = 100, .cache = "_readme_cache") -## Intro - Run multiple LLM predictions against a data frame. The predictions are processed row-wise over a specified column. 
It works using a pre-determined one-shot prompt, along with the current row's content. The prompt that is use will depend of the @@ -139,6 +137,14 @@ reviews = pl.DataFrame( ``` ::: +```{python} +#| include: false +#| eval: true + +reviews.llm.use(options = dict(seed = 100)) +``` + + ```{python} import mall import polars as pl @@ -220,8 +226,11 @@ reviews |> ## python ```{python} +#| eval: true + +x = reviews.llm.sentiment("review", options=["positive", "negative"]) -reviews.llm.sentiment("review", options=["positive", "negative"]).filter(sentiment == "negative") +x.filter(pl.col("sentiment") == "negative") ``` From ce7cb3981db3302ea500bb7e6658af67905121f0 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Sat, 5 Oct 2024 18:11:29 -0500 Subject: [PATCH 34/57] Updates summarize code --- index.qmd | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/index.qmd b/index.qmd index 8f9c314..62aae7b 100644 --- a/index.qmd +++ b/index.qmd @@ -171,7 +171,6 @@ do this we will call `llm_sentiment()`: ## R ```{r} -#| eval: true reviews |> llm_sentiment(review) @@ -180,7 +179,6 @@ reviews |> ## python ```{python} -#| eval: true reviews.llm.sentiment("review") ``` @@ -193,7 +191,6 @@ The function let's us modify the options to choose from: ## R ```{r} -#| eval: true reviews |> llm_sentiment(review, options = c("positive", "negative")) @@ -202,7 +199,6 @@ reviews |> ## python ```{python} -#| eval: true reviews.llm.sentiment("review", options=["positive", "negative"]) ``` @@ -216,7 +212,6 @@ can be used in further transformations: ## R ```{r} -#| eval: true reviews |> llm_sentiment(review, options = c("positive", "negative")) |> @@ -226,7 +221,6 @@ reviews |> ## python ```{python} -#| eval: true x = reviews.llm.sentiment("review", options=["positive", "negative"]) @@ -243,19 +237,49 @@ make it easier to capture its intent. To do this, use `llm_summarize()`. 
This function has an argument to control the maximum number of words to output (`max_words`): +::: {.panel-tabset group="language"} +## R + ```{r} +#| eval: true + reviews |> llm_summarize(review, max_words = 5) ``` +## python + +```{python} +#| eval: true + +reviews.llm.summarize("review", 5) +``` + +::: + To control the name of the prediction field, you can change `pred_name` argument. This works with the other `llm_` functions as well. +::: {.panel-tabset group="language"} +## R + ```{r} +#| eval: true + reviews |> llm_summarize(review, max_words = 5, pred_name = "review_summary") ``` +## python + +```{python} +#| eval: true + +reviews.llm.summarize("review", max_words = 5, pred_name = "review_summary") +``` + +::: + ### Classify Use the LLM to categorize the text into one of the options you provide: From fb308e2a8c810430c5145e287f7735a01e1ac109 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Sat, 5 Oct 2024 18:23:35 -0500 Subject: [PATCH 35/57] Adds classify, expands toc --- _quarto.yml | 1 + index.qmd | 19 +++++++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/_quarto.yml b/_quarto.yml index 3390754..f03e25c 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -33,6 +33,7 @@ website: format: html: toc: true + toc-expand: true code-copy: true code-overflow: wrap code-toos: true diff --git a/index.qmd b/index.qmd index 62aae7b..6f67b36 100644 --- a/index.qmd +++ b/index.qmd @@ -264,7 +264,6 @@ This works with the other `llm_` functions as well. 
## R ```{r} -#| eval: true reviews |> llm_summarize(review, max_words = 5, pred_name = "review_summary") @@ -273,7 +272,6 @@ reviews |> ## python ```{python} -#| eval: true reviews.llm.summarize("review", max_words = 5, pred_name = "review_summary") ``` @@ -284,11 +282,28 @@ reviews.llm.summarize("review", max_words = 5, pred_name = "review_summary") Use the LLM to categorize the text into one of the options you provide: + +::: {.panel-tabset group="language"} +## R + ```{r} +#| eval: true + reviews |> llm_classify(review, c("appliance", "computer")) ``` + +## python + +```{python} +#| eval: true + +reviews.llm.classify("review", ["computer", "appliance"]) +``` + +::: + ### Extract One of the most interesting operations. Using natural language, we can tell the From 3b68d9807b91ae58e4c82a4a8397a3104ae597bf Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Sat, 5 Oct 2024 18:30:33 -0500 Subject: [PATCH 36/57] Updates translate and use code --- index.qmd | 106 ++++++++++++++++++++---------------------------------- 1 file changed, 38 insertions(+), 68 deletions(-) diff --git a/index.qmd b/index.qmd index 6f67b36..ddcc4bf 100644 --- a/index.qmd +++ b/index.qmd @@ -287,8 +287,6 @@ Use the LLM to categorize the text into one of the options you provide: ## R ```{r} -#| eval: true - reviews |> llm_classify(review, c("appliance", "computer")) ``` @@ -297,7 +295,6 @@ reviews |> ## python ```{python} -#| eval: true reviews.llm.classify("review", ["computer", "appliance"]) ``` @@ -325,12 +322,27 @@ language. What is really nice, it is that you don't need to specify the language of the source text. Only the target language needs to be defined. 
The translation accuracy will depend on the LLM +::: {.panel-tabset group="language"} +## R + ```{r} +#| eval: true + reviews |> llm_translate(review, "spanish") ``` +## python + +```{python} +#| eval: true + +reviews.llm.translate("review", "spanish") +``` + +::: + ### Custom prompt It is possible to pass your own prompt to the LLM, and have `mall` run it @@ -361,10 +373,31 @@ You can also setup additional arguments that will be passed down to the function that actually runs the prediction. In the case of Ollama, that function is [`chat()`](https://hauselin.github.io/ollama-r/reference/chat.html). -```{r, eval = FALSE} + + +::: {.panel-tabset group="language"} +## R + +The model to use, and other options can be set for the current R session + +```{r} +#| eval: false llm_use("ollama", "llama3.2", seed = 100, temperature = 0) ``` + +## python + +The model to use, and other options can be set for the specific Polars +data frame + +```{python} +#| eval: false +reviews.llm.use(options = dict(seed = 100)) +``` + +::: + ## Key considerations The main consideration is **cost**. Either, time cost, or money cost. @@ -458,7 +491,7 @@ reviews_llm |> yardstick::accuracy(sentiment, predicted) ``` -## Vector functions +## Vector functions (R only) `mall` includes functions that expect a vector, instead of a table, to run the predictions. 
This should make it easier to test things, such as custom prompts @@ -473,66 +506,3 @@ llm_vec_sentiment("I am happy") llm_vec_translate("Este es el mejor dia!", "english") ``` - -::: {.panel-tabset group="language"} -## R - -``` {.r} -fizz_buzz <- function(fbnums = 1:50) { - output <- dplyr::case_when( - fbnums %% 15 == 0 ~ "FizzBuzz", - fbnums %% 3 == 0 ~ "Fizz", - fbnums %% 5 == 0 ~ "Buzz", - TRUE ~ as.character(fbnums) - ) - print(output) -} -``` - -## Python - -``` {.python} -def fizz_buzz(num): - if num % 15 == 0: - print("FizzBuzz") - elif num % 5 == 0: - print("Buzz") - elif num % 3 == 0: - print("Fizz") - else: - print(num) -``` - -::: - - -::: {.panel-tabset group="language"} -## R - -``` {.r} -fizz_buzz <- function(fbnums = 1:50) { - output <- dplyr::case_when( - fbnums %% 15 == 0 ~ "FizzBuzz", - fbnums %% 3 == 0 ~ "Fizz", - fbnums %% 5 == 0 ~ "Buzz", - TRUE ~ as.character(fbnums) - ) - print(output) -} -``` - -## Python - -``` {.python} -def fizz_buzz(num): - if num % 15 == 0: - print("FizzBuzz") - elif num % 5 == 0: - print("Buzz") - elif num % 3 == 0: - print("Fizz") - else: - print(num) -``` - -::: From 3b8b50c3f81aa5ace0294ebcb10bce29635fdc46 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Sat, 5 Oct 2024 18:36:49 -0500 Subject: [PATCH 37/57] Removes duplicated code --- index.qmd | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/index.qmd b/index.qmd index ddcc4bf..344f2f8 100644 --- a/index.qmd +++ b/index.qmd @@ -144,26 +144,6 @@ reviews = pl.DataFrame( reviews.llm.use(options = dict(seed = 100)) ``` - -```{python} -import mall -import polars as pl - -reviews = pl.DataFrame( - data=[ - "This has been the best TV I've ever used. Great screen, and sound.", - "I regret buying this laptop. It is too slow and the keyboard is too noisy", - "Not sure how to feel about my new washing machine. 
Great color, but hard to figure" - ], - schema=[("review", pl.String)], -) -``` - -```{python} -#| include: false -reviews.llm.use(options = dict(seed = 100)) -``` - For the first example, we'll asses the sentiment of each review. In order to do this we will call `llm_sentiment()`: From ea1924fb801fe20eb6d83e77de0d9f604c92cda7 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Sat, 5 Oct 2024 18:49:44 -0500 Subject: [PATCH 38/57] First experimental quartodoc run --- _quarto.yml | 16 +++++++++++++++- index.qmd | 16 ++++++++-------- objects.json | 1 + reference/MallFrame.qmd | 27 +++++++++++++++++++++++++++ reference/_api_index.qmd | 9 +++++++++ 5 files changed, 60 insertions(+), 9 deletions(-) create mode 100644 objects.json create mode 100644 reference/MallFrame.qmd create mode 100644 reference/_api_index.qmd diff --git a/_quarto.yml b/_quarto.yml index f03e25c..90c5d59 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -45,4 +45,18 @@ format: knitr: opts_chunk: collapse: true - comment: "#>" \ No newline at end of file + comment: "#>" + +quartodoc: + package: mall + options: null + style: pkgdown + dir: reference + out_index: _api_index.qmd + dynamic: true + sections: + - title: mall + desc: '' + contents: + - name: MallFrame + diff --git a/index.qmd b/index.qmd index 344f2f8..4850810 100644 --- a/index.qmd +++ b/index.qmd @@ -9,7 +9,7 @@ execute: ```{r} -#| eval: true + #| include: false library(dplyr) library(dbplyr) @@ -108,7 +108,7 @@ We will start with loading a very small data set contained in `mall`. 
It has ## R ```{r} -#| eval: true + library(mall) @@ -120,7 +120,7 @@ reviews ## python ```{python} -#| eval: true + import mall @@ -139,7 +139,7 @@ reviews = pl.DataFrame( ```{python} #| include: false -#| eval: true + reviews.llm.use(options = dict(seed = 100)) ``` @@ -221,7 +221,7 @@ function has an argument to control the maximum number of words to output ## R ```{r} -#| eval: true + reviews |> llm_summarize(review, max_words = 5) @@ -230,7 +230,7 @@ reviews |> ## python ```{python} -#| eval: true + reviews.llm.summarize("review", 5) ``` @@ -306,7 +306,7 @@ accuracy will depend on the LLM ## R ```{r} -#| eval: true + reviews |> llm_translate(review, "spanish") @@ -316,7 +316,7 @@ reviews |> ## python ```{python} -#| eval: true + reviews.llm.translate("review", "spanish") ``` diff --git a/objects.json b/objects.json new file mode 100644 index 0000000..9d36587 --- /dev/null +++ b/objects.json @@ -0,0 +1 @@ +{"project": "mall", "version": "0.0.9999", "count": 4, "items": [{"name": "mall.MallFrame.translate", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.translate", "dispname": "-"}, {"name": "mall.polars.MallFrame.translate", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.translate", "dispname": "mall.MallFrame.translate"}, {"name": "mall.MallFrame", "domain": "py", "role": "class", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame", "dispname": "-"}, {"name": "mall.polars.MallFrame", "domain": "py", "role": "class", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame", "dispname": "mall.MallFrame"}]} \ No newline at end of file diff --git a/reference/MallFrame.qmd b/reference/MallFrame.qmd new file mode 100644 index 0000000..bcd2cba --- /dev/null +++ b/reference/MallFrame.qmd @@ -0,0 +1,27 @@ +# MallFrame { #mall.MallFrame } + +`MallFrame(self, df)` + +Extension to Polars that add ability to use +an LLM to run batch predictions 
over a data frame + +## Methods + +| Name | Description | +| --- | --- | +| [translate](#mall.MallFrame.translate) | Translate text into another language. | + +### translate { #mall.MallFrame.translate } + +`MallFrame.translate(col, language='', additional='', pred_name='translation')` + +Translate text into another language. + +#### Parameters + +| Name | Type | Description | Default | +|--------------|--------|----------------------------------------------------------------------------------------|-----------------| +| `col` | | The name of the text field to process | _required_ | +| `language` | | The target language to translate to. For example 'French'. | `''` | +| `pred_name` | | A character vector with the name of the new column where the prediction will be placed | `'translation'` | +| `additional` | | Inserts this text into the prompt sent to the LLM | `''` | \ No newline at end of file diff --git a/reference/_api_index.qmd b/reference/_api_index.qmd new file mode 100644 index 0000000..901fdf5 --- /dev/null +++ b/reference/_api_index.qmd @@ -0,0 +1,9 @@ +# Function reference {.doc .doc-index} + +## mall + + + +| | | +| --- | --- | +| [MallFrame](MallFrame.qmd#mall.MallFrame) | Extension to Polars that add ability to use | \ No newline at end of file From 86dc7b19d44941b77705b1c1673cc85c2b9462a2 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Sat, 5 Oct 2024 19:03:15 -0500 Subject: [PATCH 39/57] Experiments with a unified reference index, it looks like I'm gonna need a template --- _quarto.yml | 2 +- index.qmd | 3 +- reference/index.qmd | 52 +++---------------- .../{_api_index.qmd => python_index.qmd} | 8 +-- reference/r_index.qmd | 47 +++++++++++++++++ 5 files changed, 61 insertions(+), 51 deletions(-) rename reference/{_api_index.qmd => python_index.qmd} (53%) create mode 100644 reference/r_index.qmd diff --git a/_quarto.yml b/_quarto.yml index 90c5d59..4066be6 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -52,7 +52,7 @@ quartodoc: options: null style: 
pkgdown dir: reference - out_index: _api_index.qmd + out_index: python_index.qmd dynamic: true sections: - title: mall diff --git a/index.qmd b/index.qmd index 4850810..7cded57 100644 --- a/index.qmd +++ b/index.qmd @@ -9,8 +9,9 @@ execute: ```{r} - #| include: false +#| eval: true + library(dplyr) library(dbplyr) library(tictoc) diff --git a/reference/index.qmd b/reference/index.qmd index 0c8d4bc..0044f9d 100644 --- a/reference/index.qmd +++ b/reference/index.qmd @@ -1,47 +1,9 @@ ---- -toc: false ---- - - -# Function Reference - -[llm_classify()](llm_classify.html) [llm_vec_classify()](llm_classify.html) - -      Categorize data as one of options given - - -[llm_custom()](llm_custom.html) [llm_vec_custom()](llm_custom.html) - -      Send a custom prompt to the LLM - - -[llm_extract()](llm_extract.html) [llm_vec_extract()](llm_extract.html) - -      Extract entities from text - - -[llm_sentiment()](llm_sentiment.html) [llm_vec_sentiment()](llm_sentiment.html) - -      Sentiment analysis - - -[llm_summarize()](llm_summarize.html) [llm_vec_summarize()](llm_summarize.html) - -      Summarize text - - -[llm_translate()](llm_translate.html) [llm_vec_translate()](llm_translate.html) - -      Translates text to a specific language - - -[llm_use()](llm_use.html) - -      Specify the model to use - - -[reviews](reviews.html) - -      Mini reviews data set +::: {.panel-tabset group="language"} +## R +{{< include r_index.qmd >}} +## python +{{< include python_index.qmd >}} +::: + \ No newline at end of file diff --git a/reference/_api_index.qmd b/reference/python_index.qmd similarity index 53% rename from reference/_api_index.qmd rename to reference/python_index.qmd index 901fdf5..2bca752 100644 --- a/reference/_api_index.qmd +++ b/reference/python_index.qmd @@ -1,7 +1,7 @@ -# Function reference {.doc .doc-index} - -## mall - +--- +toc: false +--- + | | | diff --git a/reference/r_index.qmd b/reference/r_index.qmd new file mode 100644 index 0000000..3c5c731 --- /dev/null +++ 
b/reference/r_index.qmd @@ -0,0 +1,47 @@ +--- +toc: false +--- + + +# Function Reference + +[llm_classify()](llm_classify.html) [llm_vec_classify()](llm_classify.html) + +      Categorize data as one of options given + + +[llm_custom()](llm_custom.html) [llm_vec_custom()](llm_custom.html) + +      Send a custom prompt to the LLM + + +[llm_extract()](llm_extract.html) [llm_vec_extract()](llm_extract.html) + +      Extract entities from text + + +[llm_sentiment()](llm_sentiment.html) [llm_vec_sentiment()](llm_sentiment.html) + +      Sentiment analysis + + +[llm_summarize()](llm_summarize.html) [llm_vec_summarize()](llm_summarize.html) + +      Summarize text + + +[llm_translate()](llm_translate.html) [llm_vec_translate()](llm_translate.html) + +      Translates text to a specific language + + +[llm_use()](llm_use.html) + +      Specify the model to use + + +[reviews](reviews.html) + +      Mini reviews data set + + From 4195aa51bfbc0c4ea4adfd14813892184cdb0d86 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Sun, 6 Oct 2024 13:48:04 -0500 Subject: [PATCH 40/57] Formalizes index pages --- _freeze/reference/llm_classify/execute-results/html.json | 2 +- _freeze/reference/llm_custom/execute-results/html.json | 2 +- _freeze/reference/llm_extract/execute-results/html.json | 2 +- _freeze/reference/llm_sentiment/execute-results/html.json | 2 +- _freeze/reference/llm_summarize/execute-results/html.json | 2 +- _freeze/reference/llm_translate/execute-results/html.json | 2 +- _freeze/reference/llm_use/execute-results/html.json | 2 +- .../reference/m_backend_submit/execute-results/html.json | 2 +- _freeze/reference/reviews/execute-results/html.json | 2 +- _quarto.yml | 2 +- reference/{python_index.qmd => _api_index.qmd} | 8 ++++---- reference/index.qmd | 8 +++++++- reference/r_index.qmd | 4 +--- utils/website/README.md | 4 ++-- utils/website/build_reference.R | 8 ++++---- utils/website/index-page.R | 4 +--- 16 files changed, 29 insertions(+), 27 deletions(-) rename 
reference/{python_index.qmd => _api_index.qmd} (53%) diff --git a/_freeze/reference/llm_classify/execute-results/html.json b/_freeze/reference/llm_classify/execute-results/html.json index 82d68b1..abcbed0 100644 --- a/_freeze/reference/llm_classify/execute-results/html.json +++ b/_freeze/reference/llm_classify/execute-results/html.json @@ -2,7 +2,7 @@ "hash": "2654553ad72a6ca1b62748f913913568", "result": { "engine": "knitr", - "markdown": "---\ntitle: \"Categorize data as one of options given\"\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n[R/llm-classify.R](https://github.com/edgararuiz/mall/blob/main/R/llm-classify.R)\n\n## llm_classify\n\n## Description\n Use a Large Language Model (LLM) to classify the provided text as one of the options provided via the `labels` argument. \n\n\n## Usage\n```r\n \nllm_classify( \n .data, \n col, \n labels, \n pred_name = \".classify\", \n additional_prompt = \"\" \n) \n \nllm_vec_classify(x, labels, additional_prompt = \"\", preview = FALSE) \n```\n\n## Arguments\n|Arguments|Description|\n|---|---|\n| .data | A `data.frame` or `tbl` object that contains the text to be analyzed |\n| col | The name of the field to analyze, supports `tidy-eval` |\n| labels | A character vector with at least 2 labels to classify the text as |\n| pred_name | A character vector with the name of the new column where the prediction will be placed |\n| additional_prompt | Inserts this text into the prompt sent to the LLM |\n| x | A vector that contains the text to be analyzed |\n| preview | It returns the R call that would have been used to run the prediction. It only returns the first record in `x`. Defaults to `FALSE` Applies to vector function only. |\n\n\n\n## Value\n `llm_classify` returns a `data.frame` or `tbl` object. `llm_vec_classify` returns a vector that is the same length as `x`. 
\n\n\n## Examples\n\n\n::: {.cell}\n\n```{.r .cell-code}\n \nlibrary(mall) \n \ndata(\"reviews\") \n \nllm_use(\"ollama\", \"llama3.2\", seed = 100, .silent = TRUE) \n \nllm_classify(reviews, review, c(\"appliance\", \"computer\")) \n#> # A tibble: 3 × 2\n#> review .classify\n#> \n#> 1 This has been the best TV I've ever used. Gr… computer \n#> 2 I regret buying this laptop. It is too slow … computer \n#> 3 Not sure how to feel about my new washing ma… appliance\n \n# Use 'pred_name' to customize the new column's name \nllm_classify( \n reviews, \n review, \n c(\"appliance\", \"computer\"), \n pred_name = \"prod_type\" \n) \n#> # A tibble: 3 × 2\n#> review prod_type\n#> \n#> 1 This has been the best TV I've ever used. Gr… computer \n#> 2 I regret buying this laptop. It is too slow … computer \n#> 3 Not sure how to feel about my new washing ma… appliance\n \n# Pass custom values for each classification \nllm_classify(reviews, review, c(\"appliance\" ~ 1, \"computer\" ~ 2)) \n#> # A tibble: 3 × 2\n#> review .classify\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. 1\n#> 2 I regret buying this laptop. It is too slow and the keyboard is too… 2\n#> 3 Not sure how to feel about my new washing machine. Great color, but… 1\n \n# For character vectors, instead of a data frame, use this function \nllm_vec_classify( \n c(\"this is important!\", \"just whenever\"), \n c(\"urgent\", \"not urgent\") \n) \n#> [1] \"urgent\" \"urgent\"\n \n# To preview the first call that will be made to the downstream R function \nllm_vec_classify( \n c(\"this is important!\", \"just whenever\"), \n c(\"urgent\", \"not urgent\"), \n preview = TRUE \n) \n#> ollamar::chat(messages = list(list(role = \"user\", content = \"You are a helpful classification engine. Determine if the text refers to one of the following: urgent, not urgent. No capitalization. No explanations. 
The answer is based on the following text:\\nthis is important!\")), \n#> output = \"text\", model = \"llama3.2\", seed = 100)\n```\n:::\n", + "markdown": "---\ntitle: \"Categorize data as one of options given\"\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n\n[R/llm-classify.R](https://github.com/edgararuiz/mall/blob/main/R/llm-classify.R)\n\n## llm_classify\n\n## Description\n Use a Large Language Model (LLM) to classify the provided text as one of the options provided via the `labels` argument. \n\n\n## Usage\n```r\n \nllm_classify( \n .data, \n col, \n labels, \n pred_name = \".classify\", \n additional_prompt = \"\" \n) \n \nllm_vec_classify(x, labels, additional_prompt = \"\", preview = FALSE) \n```\n\n## Arguments\n|Arguments|Description|\n|---|---|\n| .data | A `data.frame` or `tbl` object that contains the text to be analyzed |\n| col | The name of the field to analyze, supports `tidy-eval` |\n| labels | A character vector with at least 2 labels to classify the text as |\n| pred_name | A character vector with the name of the new column where the prediction will be placed |\n| additional_prompt | Inserts this text into the prompt sent to the LLM |\n| x | A vector that contains the text to be analyzed |\n| preview | It returns the R call that would have been used to run the prediction. It only returns the first record in `x`. Defaults to `FALSE` Applies to vector function only. |\n\n\n\n## Value\n `llm_classify` returns a `data.frame` or `tbl` object. `llm_vec_classify` returns a vector that is the same length as `x`. \n\n\n## Examples\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n \nlibrary(mall) \n \ndata(\"reviews\") \n \nllm_use(\"ollama\", \"llama3.2\", seed = 100, .silent = TRUE) \n \nllm_classify(reviews, review, c(\"appliance\", \"computer\")) \n#> # A tibble: 3 × 2\n#> review .classify\n#> \n#> 1 This has been the best TV I've ever used. Gr… computer \n#> 2 I regret buying this laptop. 
It is too slow … computer \n#> 3 Not sure how to feel about my new washing ma… appliance\n \n# Use 'pred_name' to customize the new column's name \nllm_classify( \n reviews, \n review, \n c(\"appliance\", \"computer\"), \n pred_name = \"prod_type\" \n) \n#> # A tibble: 3 × 2\n#> review prod_type\n#> \n#> 1 This has been the best TV I've ever used. Gr… computer \n#> 2 I regret buying this laptop. It is too slow … computer \n#> 3 Not sure how to feel about my new washing ma… appliance\n \n# Pass custom values for each classification \nllm_classify(reviews, review, c(\"appliance\" ~ 1, \"computer\" ~ 2)) \n#> # A tibble: 3 × 2\n#> review .classify\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. 1\n#> 2 I regret buying this laptop. It is too slow and the keyboard is too… 2\n#> 3 Not sure how to feel about my new washing machine. Great color, but… 1\n \n# For character vectors, instead of a data frame, use this function \nllm_vec_classify( \n c(\"this is important!\", \"just whenever\"), \n c(\"urgent\", \"not urgent\") \n) \n#> [1] \"urgent\" \"urgent\"\n \n# To preview the first call that will be made to the downstream R function \nllm_vec_classify( \n c(\"this is important!\", \"just whenever\"), \n c(\"urgent\", \"not urgent\"), \n preview = TRUE \n) \n#> ollamar::chat(messages = list(list(role = \"user\", content = \"You are a helpful classification engine. Determine if the text refers to one of the following: urgent, not urgent. No capitalization. No explanations. 
The answer is based on the following text:\\nthis is important!\")), \n#> output = \"text\", model = \"llama3.2\", seed = 100)\n```\n:::\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/reference/llm_custom/execute-results/html.json b/_freeze/reference/llm_custom/execute-results/html.json index e746465..c85f7b9 100644 --- a/_freeze/reference/llm_custom/execute-results/html.json +++ b/_freeze/reference/llm_custom/execute-results/html.json @@ -2,7 +2,7 @@ "hash": "9f9fb9cfdaebdc5ea55c78df85881f4f", "result": { "engine": "knitr", - "markdown": "---\ntitle: \"Send a custom prompt to the LLM\"\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n[R/llm-custom.R](https://github.com/edgararuiz/mall/blob/main/R/llm-custom.R)\n\n## llm_custom\n\n## Description\n Use a Large Language Model (LLM) to process the provided text using the instructions from `prompt` \n\n\n## Usage\n```r\n \nllm_custom(.data, col, prompt = \"\", pred_name = \".pred\", valid_resps = \"\") \n \nllm_vec_custom(x, prompt = \"\", valid_resps = NULL) \n```\n\n## Arguments\n|Arguments|Description|\n|---|---|\n| .data | A `data.frame` or `tbl` object that contains the text to be analyzed |\n| col | The name of the field to analyze, supports `tidy-eval` |\n| prompt | The prompt to append to each record sent to the LLM |\n| pred_name | A character vector with the name of the new column where the prediction will be placed |\n| valid_resps | If the response from the LLM is not open, but deterministic, provide the options in a vector. This function will set to `NA` any response not in the options |\n| x | A vector that contains the text to be analyzed |\n\n\n\n## Value\n `llm_custom` returns a `data.frame` or `tbl` object. `llm_vec_custom` returns a vector that is the same length as `x`. 
\n\n\n## Examples\n\n\n::: {.cell}\n\n```{.r .cell-code}\n \nlibrary(mall) \n \ndata(\"reviews\") \n \nllm_use(\"ollama\", \"llama3.2\", seed = 100, .silent = TRUE) \n \nmy_prompt <- paste( \n \"Answer a question.\", \n \"Return only the answer, no explanation\", \n \"Acceptable answers are 'yes', 'no'\", \n \"Answer this about the following text, is this a happy customer?:\" \n) \n \nreviews |> \n llm_custom(review, my_prompt) \n#> # A tibble: 3 × 2\n#> review .pred\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. Yes \n#> 2 I regret buying this laptop. It is too slow and the keyboard is too noi… No \n#> 3 Not sure how to feel about my new washing machine. Great color, but har… No\n```\n:::\n", + "markdown": "---\ntitle: \"Send a custom prompt to the LLM\"\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n\n[R/llm-custom.R](https://github.com/edgararuiz/mall/blob/main/R/llm-custom.R)\n\n## llm_custom\n\n## Description\n Use a Large Language Model (LLM) to process the provided text using the instructions from `prompt` \n\n\n## Usage\n```r\n \nllm_custom(.data, col, prompt = \"\", pred_name = \".pred\", valid_resps = \"\") \n \nllm_vec_custom(x, prompt = \"\", valid_resps = NULL) \n```\n\n## Arguments\n|Arguments|Description|\n|---|---|\n| .data | A `data.frame` or `tbl` object that contains the text to be analyzed |\n| col | The name of the field to analyze, supports `tidy-eval` |\n| prompt | The prompt to append to each record sent to the LLM |\n| pred_name | A character vector with the name of the new column where the prediction will be placed |\n| valid_resps | If the response from the LLM is not open, but deterministic, provide the options in a vector. This function will set to `NA` any response not in the options |\n| x | A vector that contains the text to be analyzed |\n\n\n\n## Value\n `llm_custom` returns a `data.frame` or `tbl` object. `llm_vec_custom` returns a vector that is the same length as `x`. 
\n\n\n## Examples\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n \nlibrary(mall) \n \ndata(\"reviews\") \n \nllm_use(\"ollama\", \"llama3.2\", seed = 100, .silent = TRUE) \n \nmy_prompt <- paste( \n \"Answer a question.\", \n \"Return only the answer, no explanation\", \n \"Acceptable answers are 'yes', 'no'\", \n \"Answer this about the following text, is this a happy customer?:\" \n) \n \nreviews |> \n llm_custom(review, my_prompt) \n#> # A tibble: 3 × 2\n#> review .pred\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. Yes \n#> 2 I regret buying this laptop. It is too slow and the keyboard is too noi… No \n#> 3 Not sure how to feel about my new washing machine. Great color, but har… No\n```\n:::\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/reference/llm_extract/execute-results/html.json b/_freeze/reference/llm_extract/execute-results/html.json index 5ff0d5e..85df9d0 100644 --- a/_freeze/reference/llm_extract/execute-results/html.json +++ b/_freeze/reference/llm_extract/execute-results/html.json @@ -2,7 +2,7 @@ "hash": "fa18360fb7c78438dcd9a82a136ef52a", "result": { "engine": "knitr", - "markdown": "---\ntitle: \"Extract entities from text\"\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n[R/llm-extract.R](https://github.com/edgararuiz/mall/blob/main/R/llm-extract.R)\n\n## llm_extract\n\n## Description\n Use a Large Language Model (LLM) to extract specific entity, or entities, from the provided text \n\n\n## Usage\n```r\n \nllm_extract( \n .data, \n col, \n labels, \n expand_cols = FALSE, \n additional_prompt = \"\", \n pred_name = \".extract\" \n) \n \nllm_vec_extract(x, labels = c(), additional_prompt = \"\", preview = FALSE) \n```\n\n## Arguments\n|Arguments|Description|\n|---|---|\n| .data | A `data.frame` or `tbl` object that contains the text to be analyzed |\n| col | The name of the field to analyze, supports `tidy-eval` |\n| labels | A vector with the entities to extract from the text |\n| 
expand_cols | If multiple `labels` are passed, this is a flag that tells the function to create a new column per item in `labels`. If `labels` is a named vector, this function will use those names as the new column names, if not, the function will use a sanitized version of the content as the name. |\n| additional_prompt | Inserts this text into the prompt sent to the LLM |\n| pred_name | A character vector with the name of the new column where the prediction will be placed |\n| x | A vector that contains the text to be analyzed |\n| preview | It returns the R call that would have been used to run the prediction. It only returns the first record in `x`. Defaults to `FALSE` Applies to vector function only. |\n\n\n\n## Value\n `llm_extract` returns a `data.frame` or `tbl` object. `llm_vec_extract` returns a vector that is the same length as `x`. \n\n\n## Examples\n\n\n::: {.cell}\n\n```{.r .cell-code}\n \nlibrary(mall) \n \ndata(\"reviews\") \n \nllm_use(\"ollama\", \"llama3.2\", seed = 100, .silent = TRUE) \n \n# Use 'labels' to let the function know what to extract \nllm_extract(reviews, review, labels = \"product\") \n#> # A tibble: 3 × 2\n#> review .extract \n#> \n#> 1 This has been the best TV I've ever used. Gr… tv \n#> 2 I regret buying this laptop. It is too slow … laptop \n#> 3 Not sure how to feel about my new washing ma… washing machine\n \n# Use 'pred_name' to customize the new column's name \nllm_extract(reviews, review, \"product\", pred_name = \"prod\") \n#> # A tibble: 3 × 2\n#> review prod \n#> \n#> 1 This has been the best TV I've ever used. Gr… tv \n#> 2 I regret buying this laptop. It is too slow … laptop \n#> 3 Not sure how to feel about my new washing ma… washing machine\n \n# Pass a vector to request multiple things, the results will be pipe delimeted \n# in a single column \nllm_extract(reviews, review, c(\"product\", \"feelings\")) \n#> # A tibble: 3 × 2\n#> review .extract \n#> \n#> 1 This has been the best TV I've ever used. 
Gr… tv | great \n#> 2 I regret buying this laptop. It is too slow … laptop|frustration \n#> 3 Not sure how to feel about my new washing ma… washing machine | confusion\n \n# To get multiple columns, use 'expand_cols' \nllm_extract(reviews, review, c(\"product\", \"feelings\"), expand_cols = TRUE) \n#> # A tibble: 3 × 3\n#> review product feelings \n#> \n#> 1 This has been the best TV I've ever used. Gr… \"tv \" \" great\" \n#> 2 I regret buying this laptop. It is too slow … \"laptop\" \"frustration\"\n#> 3 Not sure how to feel about my new washing ma… \"washing machine \" \" confusion\"\n \n# Pass a named vector to set the resulting column names \nllm_extract( \n .data = reviews, \n col = review, \n labels = c(prod = \"product\", feels = \"feelings\"), \n expand_cols = TRUE \n) \n#> # A tibble: 3 × 3\n#> review prod feels \n#> \n#> 1 This has been the best TV I've ever used. Gr… \"tv \" \" great\" \n#> 2 I regret buying this laptop. It is too slow … \"laptop\" \"frustration\"\n#> 3 Not sure how to feel about my new washing ma… \"washing machine \" \" confusion\"\n \n# For character vectors, instead of a data frame, use this function \nllm_vec_extract(\"bob smith, 123 3rd street\", c(\"name\", \"address\")) \n#> [1] \"bob smith | 123 3rd street\"\n \n# To preview the first call that will be made to the downstream R function \nllm_vec_extract( \n \"bob smith, 123 3rd street\", \n c(\"name\", \"address\"), \n preview = TRUE \n) \n#> ollamar::chat(messages = list(list(role = \"user\", content = \"You are a helpful text extraction engine. Extract the name, address being referred to on the text. I expect 2 items exactly. No capitalization. No explanations. Return the response exclusively in a pipe separated list, and no headers. 
The answer is based on the following text:\\nbob smith, 123 3rd street\")), \n#> output = \"text\", model = \"llama3.2\", seed = 100)\n```\n:::\n", + "markdown": "---\ntitle: \"Extract entities from text\"\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n\n[R/llm-extract.R](https://github.com/edgararuiz/mall/blob/main/R/llm-extract.R)\n\n## llm_extract\n\n## Description\n Use a Large Language Model (LLM) to extract specific entity, or entities, from the provided text \n\n\n## Usage\n```r\n \nllm_extract( \n .data, \n col, \n labels, \n expand_cols = FALSE, \n additional_prompt = \"\", \n pred_name = \".extract\" \n) \n \nllm_vec_extract(x, labels = c(), additional_prompt = \"\", preview = FALSE) \n```\n\n## Arguments\n|Arguments|Description|\n|---|---|\n| .data | A `data.frame` or `tbl` object that contains the text to be analyzed |\n| col | The name of the field to analyze, supports `tidy-eval` |\n| labels | A vector with the entities to extract from the text |\n| expand_cols | If multiple `labels` are passed, this is a flag that tells the function to create a new column per item in `labels`. If `labels` is a named vector, this function will use those names as the new column names, if not, the function will use a sanitized version of the content as the name. |\n| additional_prompt | Inserts this text into the prompt sent to the LLM |\n| pred_name | A character vector with the name of the new column where the prediction will be placed |\n| x | A vector that contains the text to be analyzed |\n| preview | It returns the R call that would have been used to run the prediction. It only returns the first record in `x`. Defaults to `FALSE` Applies to vector function only. |\n\n\n\n## Value\n `llm_extract` returns a `data.frame` or `tbl` object. `llm_vec_extract` returns a vector that is the same length as `x`. 
\n\n\n## Examples\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n \nlibrary(mall) \n \ndata(\"reviews\") \n \nllm_use(\"ollama\", \"llama3.2\", seed = 100, .silent = TRUE) \n \n# Use 'labels' to let the function know what to extract \nllm_extract(reviews, review, labels = \"product\") \n#> # A tibble: 3 × 2\n#> review .extract \n#> \n#> 1 This has been the best TV I've ever used. Gr… tv \n#> 2 I regret buying this laptop. It is too slow … laptop \n#> 3 Not sure how to feel about my new washing ma… washing machine\n \n# Use 'pred_name' to customize the new column's name \nllm_extract(reviews, review, \"product\", pred_name = \"prod\") \n#> # A tibble: 3 × 2\n#> review prod \n#> \n#> 1 This has been the best TV I've ever used. Gr… tv \n#> 2 I regret buying this laptop. It is too slow … laptop \n#> 3 Not sure how to feel about my new washing ma… washing machine\n \n# Pass a vector to request multiple things, the results will be pipe delimeted \n# in a single column \nllm_extract(reviews, review, c(\"product\", \"feelings\")) \n#> # A tibble: 3 × 2\n#> review .extract \n#> \n#> 1 This has been the best TV I've ever used. Gr… tv | great \n#> 2 I regret buying this laptop. It is too slow … laptop|frustration \n#> 3 Not sure how to feel about my new washing ma… washing machine | confusion\n \n# To get multiple columns, use 'expand_cols' \nllm_extract(reviews, review, c(\"product\", \"feelings\"), expand_cols = TRUE) \n#> # A tibble: 3 × 3\n#> review product feelings \n#> \n#> 1 This has been the best TV I've ever used. Gr… \"tv \" \" great\" \n#> 2 I regret buying this laptop. 
It is too slow … \"laptop\" \"frustration\"\n#> 3 Not sure how to feel about my new washing ma… \"washing machine \" \" confusion\"\n \n# Pass a named vector to set the resulting column names \nllm_extract( \n .data = reviews, \n col = review, \n labels = c(prod = \"product\", feels = \"feelings\"), \n expand_cols = TRUE \n) \n#> # A tibble: 3 × 3\n#> review prod feels \n#> \n#> 1 This has been the best TV I've ever used. Gr… \"tv \" \" great\" \n#> 2 I regret buying this laptop. It is too slow … \"laptop\" \"frustration\"\n#> 3 Not sure how to feel about my new washing ma… \"washing machine \" \" confusion\"\n \n# For character vectors, instead of a data frame, use this function \nllm_vec_extract(\"bob smith, 123 3rd street\", c(\"name\", \"address\")) \n#> [1] \"bob smith | 123 3rd street\"\n \n# To preview the first call that will be made to the downstream R function \nllm_vec_extract( \n \"bob smith, 123 3rd street\", \n c(\"name\", \"address\"), \n preview = TRUE \n) \n#> ollamar::chat(messages = list(list(role = \"user\", content = \"You are a helpful text extraction engine. Extract the name, address being referred to on the text. I expect 2 items exactly. No capitalization. No explanations. Return the response exclusively in a pipe separated list, and no headers. 
The answer is based on the following text:\\nbob smith, 123 3rd street\")), \n#> output = \"text\", model = \"llama3.2\", seed = 100)\n```\n:::\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/reference/llm_sentiment/execute-results/html.json b/_freeze/reference/llm_sentiment/execute-results/html.json index bff4d77..5247b0c 100644 --- a/_freeze/reference/llm_sentiment/execute-results/html.json +++ b/_freeze/reference/llm_sentiment/execute-results/html.json @@ -2,7 +2,7 @@ "hash": "ce5d3cf8515ce8aee247eeb4715bcbc0", "result": { "engine": "knitr", - "markdown": "---\ntitle: \"Sentiment analysis\"\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n[R/llm-sentiment.R](https://github.com/edgararuiz/mall/blob/main/R/llm-sentiment.R)\n\n## llm_sentiment\n\n## Description\n Use a Large Language Model (LLM) to perform sentiment analysis from the provided text \n\n\n## Usage\n```r\n \nllm_sentiment( \n .data, \n col, \n options = c(\"positive\", \"negative\", \"neutral\"), \n pred_name = \".sentiment\", \n additional_prompt = \"\" \n) \n \nllm_vec_sentiment( \n x, \n options = c(\"positive\", \"negative\", \"neutral\"), \n additional_prompt = \"\", \n preview = FALSE \n) \n```\n\n## Arguments\n|Arguments|Description|\n|---|---|\n| .data | A `data.frame` or `tbl` object that contains the text to be analyzed |\n| col | The name of the field to analyze, supports `tidy-eval` |\n| options | A vector with the options that the LLM should use to assign a sentiment to the text. Defaults to: 'positive', 'negative', 'neutral' |\n| pred_name | A character vector with the name of the new column where the prediction will be placed |\n| additional_prompt | Inserts this text into the prompt sent to the LLM |\n| x | A vector that contains the text to be analyzed |\n| preview | It returns the R call that would have been used to run the prediction. It only returns the first record in `x`. Defaults to `FALSE` Applies to vector function only. 
|\n\n\n\n## Value\n `llm_sentiment` returns a `data.frame` or `tbl` object. `llm_vec_sentiment` returns a vector that is the same length as `x`. \n\n\n## Examples\n\n\n::: {.cell}\n\n```{.r .cell-code}\n \nlibrary(mall) \n \ndata(\"reviews\") \n \nllm_use(\"ollama\", \"llama3.2\", seed = 100, .silent = TRUE) \n \nllm_sentiment(reviews, review) \n#> # A tibble: 3 × 2\n#> review .sentiment\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. positive \n#> 2 I regret buying this laptop. It is too slow and the keyboard is to… negative \n#> 3 Not sure how to feel about my new washing machine. Great color, bu… neutral\n \n# Use 'pred_name' to customize the new column's name \nllm_sentiment(reviews, review, pred_name = \"review_sentiment\") \n#> # A tibble: 3 × 2\n#> review review_sentiment\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and … positive \n#> 2 I regret buying this laptop. It is too slow and the keyboard… negative \n#> 3 Not sure how to feel about my new washing machine. Great col… neutral\n \n# Pass custom sentiment options \nllm_sentiment(reviews, review, c(\"positive\", \"negative\")) \n#> # A tibble: 3 × 2\n#> review .sentiment\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. positive \n#> 2 I regret buying this laptop. It is too slow and the keyboard is to… negative \n#> 3 Not sure how to feel about my new washing machine. Great color, bu… negative\n \n# Specify values to return per sentiment \nllm_sentiment(reviews, review, c(\"positive\" ~ 1, \"negative\" ~ 0)) \n#> # A tibble: 3 × 2\n#> review .sentiment\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. 1\n#> 2 I regret buying this laptop. It is too slow and the keyboard is to… 0\n#> 3 Not sure how to feel about my new washing machine. 
Great color, bu… 0\n \n# For character vectors, instead of a data frame, use this function \nllm_vec_sentiment(c(\"I am happy\", \"I am sad\")) \n#> [1] \"positive\" \"negative\"\n \n# To preview the first call that will be made to the downstream R function \nllm_vec_sentiment(c(\"I am happy\", \"I am sad\"), preview = TRUE) \n#> ollamar::chat(messages = list(list(role = \"user\", content = \"You are a helpful sentiment engine. Return only one of the following answers: positive, negative, neutral. No capitalization. No explanations. The answer is based on the following text:\\nI am happy\")), \n#> output = \"text\", model = \"llama3.2\", seed = 100)\n```\n:::\n", + "markdown": "---\ntitle: \"Sentiment analysis\"\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n\n[R/llm-sentiment.R](https://github.com/edgararuiz/mall/blob/main/R/llm-sentiment.R)\n\n## llm_sentiment\n\n## Description\n Use a Large Language Model (LLM) to perform sentiment analysis from the provided text \n\n\n## Usage\n```r\n \nllm_sentiment( \n .data, \n col, \n options = c(\"positive\", \"negative\", \"neutral\"), \n pred_name = \".sentiment\", \n additional_prompt = \"\" \n) \n \nllm_vec_sentiment( \n x, \n options = c(\"positive\", \"negative\", \"neutral\"), \n additional_prompt = \"\", \n preview = FALSE \n) \n```\n\n## Arguments\n|Arguments|Description|\n|---|---|\n| .data | A `data.frame` or `tbl` object that contains the text to be analyzed |\n| col | The name of the field to analyze, supports `tidy-eval` |\n| options | A vector with the options that the LLM should use to assign a sentiment to the text. Defaults to: 'positive', 'negative', 'neutral' |\n| pred_name | A character vector with the name of the new column where the prediction will be placed |\n| additional_prompt | Inserts this text into the prompt sent to the LLM |\n| x | A vector that contains the text to be analyzed |\n| preview | It returns the R call that would have been used to run the prediction. 
It only returns the first record in `x`. Defaults to `FALSE` Applies to vector function only. |\n\n\n\n## Value\n `llm_sentiment` returns a `data.frame` or `tbl` object. `llm_vec_sentiment` returns a vector that is the same length as `x`. \n\n\n## Examples\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n \nlibrary(mall) \n \ndata(\"reviews\") \n \nllm_use(\"ollama\", \"llama3.2\", seed = 100, .silent = TRUE) \n \nllm_sentiment(reviews, review) \n#> # A tibble: 3 × 2\n#> review .sentiment\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. positive \n#> 2 I regret buying this laptop. It is too slow and the keyboard is to… negative \n#> 3 Not sure how to feel about my new washing machine. Great color, bu… neutral\n \n# Use 'pred_name' to customize the new column's name \nllm_sentiment(reviews, review, pred_name = \"review_sentiment\") \n#> # A tibble: 3 × 2\n#> review review_sentiment\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and … positive \n#> 2 I regret buying this laptop. It is too slow and the keyboard… negative \n#> 3 Not sure how to feel about my new washing machine. Great col… neutral\n \n# Pass custom sentiment options \nllm_sentiment(reviews, review, c(\"positive\", \"negative\")) \n#> # A tibble: 3 × 2\n#> review .sentiment\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. positive \n#> 2 I regret buying this laptop. It is too slow and the keyboard is to… negative \n#> 3 Not sure how to feel about my new washing machine. Great color, bu… negative\n \n# Specify values to return per sentiment \nllm_sentiment(reviews, review, c(\"positive\" ~ 1, \"negative\" ~ 0)) \n#> # A tibble: 3 × 2\n#> review .sentiment\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. 1\n#> 2 I regret buying this laptop. It is too slow and the keyboard is to… 0\n#> 3 Not sure how to feel about my new washing machine. 
Great color, bu… 0\n \n# For character vectors, instead of a data frame, use this function \nllm_vec_sentiment(c(\"I am happy\", \"I am sad\")) \n#> [1] \"positive\" \"negative\"\n \n# To preview the first call that will be made to the downstream R function \nllm_vec_sentiment(c(\"I am happy\", \"I am sad\"), preview = TRUE) \n#> ollamar::chat(messages = list(list(role = \"user\", content = \"You are a helpful sentiment engine. Return only one of the following answers: positive, negative, neutral. No capitalization. No explanations. The answer is based on the following text:\\nI am happy\")), \n#> output = \"text\", model = \"llama3.2\", seed = 100)\n```\n:::\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/reference/llm_summarize/execute-results/html.json b/_freeze/reference/llm_summarize/execute-results/html.json index 97d568e..78fa764 100644 --- a/_freeze/reference/llm_summarize/execute-results/html.json +++ b/_freeze/reference/llm_summarize/execute-results/html.json @@ -2,7 +2,7 @@ "hash": "7dcf1326b18f3451fa4dc840a052e68e", "result": { "engine": "knitr", - "markdown": "---\ntitle: \"Summarize text\"\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n[R/llm-summarize.R](https://github.com/edgararuiz/mall/blob/main/R/llm-summarize.R)\n\n## llm_summarize\n\n## Description\n Use a Large Language Model (LLM) to summarize text \n\n\n## Usage\n```r\n \nllm_summarize( \n .data, \n col, \n max_words = 10, \n pred_name = \".summary\", \n additional_prompt = \"\" \n) \n \nllm_vec_summarize(x, max_words = 10, additional_prompt = \"\", preview = FALSE) \n```\n\n## Arguments\n|Arguments|Description|\n|---|---|\n| .data | A `data.frame` or `tbl` object that contains the text to be analyzed |\n| col | The name of the field to analyze, supports `tidy-eval` |\n| max_words | The maximum number of words that the LLM should use in the summary. Defaults to 10. 
|\n| pred_name | A character vector with the name of the new column where the prediction will be placed |\n| additional_prompt | Inserts this text into the prompt sent to the LLM |\n| x | A vector that contains the text to be analyzed |\n| preview | It returns the R call that would have been used to run the prediction. It only returns the first record in `x`. Defaults to `FALSE` Applies to vector function only. |\n\n\n\n## Value\n `llm_summarize` returns a `data.frame` or `tbl` object. `llm_vec_summarize` returns a vector that is the same length as `x`. \n\n\n## Examples\n\n\n::: {.cell}\n\n```{.r .cell-code}\n \nlibrary(mall) \n \ndata(\"reviews\") \n \nllm_use(\"ollama\", \"llama3.2\", seed = 100, .silent = TRUE) \n \n# Use max_words to set the maximum number of words to use for the summary \nllm_summarize(reviews, review, max_words = 5) \n#> # A tibble: 3 × 2\n#> review .summary \n#> \n#> 1 This has been the best TV I've ever used. Gr… it's a great tv \n#> 2 I regret buying this laptop. It is too slow … laptop purchase was a mistake \n#> 3 Not sure how to feel about my new washing ma… having mixed feelings about it\n \n# Use 'pred_name' to customize the new column's name \nllm_summarize(reviews, review, 5, pred_name = \"review_summary\") \n#> # A tibble: 3 × 2\n#> review review_summary \n#> \n#> 1 This has been the best TV I've ever used. Gr… it's a great tv \n#> 2 I regret buying this laptop. It is too slow … laptop purchase was a mistake \n#> 3 Not sure how to feel about my new washing ma… having mixed feelings about it\n \n# For character vectors, instead of a data frame, use this function \nllm_vec_summarize( \n \"This has been the best TV I've ever used. Great screen, and sound.\", \n max_words = 5 \n) \n#> [1] \"it's a great tv\"\n \n# To preview the first call that will be made to the downstream R function \nllm_vec_summarize( \n \"This has been the best TV I've ever used. 
Great screen, and sound.\", \n max_words = 5, \n preview = TRUE \n) \n#> ollamar::chat(messages = list(list(role = \"user\", content = \"You are a helpful summarization engine. Your answer will contain no no capitalization and no explanations. Return no more than 5 words. The answer is the summary of the following text:\\nThis has been the best TV I've ever used. Great screen, and sound.\")), \n#> output = \"text\", model = \"llama3.2\", seed = 100)\n```\n:::\n", + "markdown": "---\ntitle: \"Summarize text\"\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n\n[R/llm-summarize.R](https://github.com/edgararuiz/mall/blob/main/R/llm-summarize.R)\n\n## llm_summarize\n\n## Description\n Use a Large Language Model (LLM) to summarize text \n\n\n## Usage\n```r\n \nllm_summarize( \n .data, \n col, \n max_words = 10, \n pred_name = \".summary\", \n additional_prompt = \"\" \n) \n \nllm_vec_summarize(x, max_words = 10, additional_prompt = \"\", preview = FALSE) \n```\n\n## Arguments\n|Arguments|Description|\n|---|---|\n| .data | A `data.frame` or `tbl` object that contains the text to be analyzed |\n| col | The name of the field to analyze, supports `tidy-eval` |\n| max_words | The maximum number of words that the LLM should use in the summary. Defaults to 10. |\n| pred_name | A character vector with the name of the new column where the prediction will be placed |\n| additional_prompt | Inserts this text into the prompt sent to the LLM |\n| x | A vector that contains the text to be analyzed |\n| preview | It returns the R call that would have been used to run the prediction. It only returns the first record in `x`. Defaults to `FALSE` Applies to vector function only. |\n\n\n\n## Value\n `llm_summarize` returns a `data.frame` or `tbl` object. `llm_vec_summarize` returns a vector that is the same length as `x`. 
\n\n\n## Examples\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n \nlibrary(mall) \n \ndata(\"reviews\") \n \nllm_use(\"ollama\", \"llama3.2\", seed = 100, .silent = TRUE) \n \n# Use max_words to set the maximum number of words to use for the summary \nllm_summarize(reviews, review, max_words = 5) \n#> # A tibble: 3 × 2\n#> review .summary \n#> \n#> 1 This has been the best TV I've ever used. Gr… it's a great tv \n#> 2 I regret buying this laptop. It is too slow … laptop purchase was a mistake \n#> 3 Not sure how to feel about my new washing ma… having mixed feelings about it\n \n# Use 'pred_name' to customize the new column's name \nllm_summarize(reviews, review, 5, pred_name = \"review_summary\") \n#> # A tibble: 3 × 2\n#> review review_summary \n#> \n#> 1 This has been the best TV I've ever used. Gr… it's a great tv \n#> 2 I regret buying this laptop. It is too slow … laptop purchase was a mistake \n#> 3 Not sure how to feel about my new washing ma… having mixed feelings about it\n \n# For character vectors, instead of a data frame, use this function \nllm_vec_summarize( \n \"This has been the best TV I've ever used. Great screen, and sound.\", \n max_words = 5 \n) \n#> [1] \"it's a great tv\"\n \n# To preview the first call that will be made to the downstream R function \nllm_vec_summarize( \n \"This has been the best TV I've ever used. Great screen, and sound.\", \n max_words = 5, \n preview = TRUE \n) \n#> ollamar::chat(messages = list(list(role = \"user\", content = \"You are a helpful summarization engine. Your answer will contain no no capitalization and no explanations. Return no more than 5 words. The answer is the summary of the following text:\\nThis has been the best TV I've ever used. 
Great screen, and sound.\")), \n#> output = \"text\", model = \"llama3.2\", seed = 100)\n```\n:::\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/reference/llm_translate/execute-results/html.json b/_freeze/reference/llm_translate/execute-results/html.json index fd5b557..c004058 100644 --- a/_freeze/reference/llm_translate/execute-results/html.json +++ b/_freeze/reference/llm_translate/execute-results/html.json @@ -2,7 +2,7 @@ "hash": "446270788110e4132cda33c384ad9125", "result": { "engine": "knitr", - "markdown": "---\ntitle: \"Translates text to a specific language\"\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n[R/llm-translate.R](https://github.com/edgararuiz/mall/blob/main/R/llm-translate.R)\n\n## llm_translate\n\n## Description\n Use a Large Language Model (LLM) to translate a text to a specific language \n\n\n## Usage\n```r\n \nllm_translate( \n .data, \n col, \n language, \n pred_name = \".translation\", \n additional_prompt = \"\" \n) \n \nllm_vec_translate(x, language, additional_prompt = \"\", preview = FALSE) \n```\n\n## Arguments\n|Arguments|Description|\n|---|---|\n| .data | A `data.frame` or `tbl` object that contains the text to be analyzed |\n| col | The name of the field to analyze, supports `tidy-eval` |\n| language | Target language to translate the text to |\n| pred_name | A character vector with the name of the new column where the prediction will be placed |\n| additional_prompt | Inserts this text into the prompt sent to the LLM |\n| x | A vector that contains the text to be analyzed |\n| preview | It returns the R call that would have been used to run the prediction. It only returns the first record in `x`. Defaults to `FALSE` Applies to vector function only. |\n\n\n\n## Value\n `llm_translate` returns a `data.frame` or `tbl` object. `llm_vec_translate` returns a vector that is the same length as `x`. 
\n\n\n## Examples\n\n\n::: {.cell}\n\n```{.r .cell-code}\n \nlibrary(mall) \n \ndata(\"reviews\") \n \nllm_use(\"ollama\", \"llama3.2\", seed = 100, .silent = TRUE) \n \n# Pass the desired language to translate to \nllm_translate(reviews, review, \"spanish\") \n#> # A tibble: 3 × 2\n#> review .translation \n#> \n#> 1 This has been the best TV I've ever used. Gr… Esta ha sido la mejor televisió…\n#> 2 I regret buying this laptop. It is too slow … Me arrepiento de comprar este p…\n#> 3 Not sure how to feel about my new washing ma… No estoy seguro de cómo me sien…\n```\n:::\n", + "markdown": "---\ntitle: \"Translates text to a specific language\"\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n\n[R/llm-translate.R](https://github.com/edgararuiz/mall/blob/main/R/llm-translate.R)\n\n## llm_translate\n\n## Description\n Use a Large Language Model (LLM) to translate a text to a specific language \n\n\n## Usage\n```r\n \nllm_translate( \n .data, \n col, \n language, \n pred_name = \".translation\", \n additional_prompt = \"\" \n) \n \nllm_vec_translate(x, language, additional_prompt = \"\", preview = FALSE) \n```\n\n## Arguments\n|Arguments|Description|\n|---|---|\n| .data | A `data.frame` or `tbl` object that contains the text to be analyzed |\n| col | The name of the field to analyze, supports `tidy-eval` |\n| language | Target language to translate the text to |\n| pred_name | A character vector with the name of the new column where the prediction will be placed |\n| additional_prompt | Inserts this text into the prompt sent to the LLM |\n| x | A vector that contains the text to be analyzed |\n| preview | It returns the R call that would have been used to run the prediction. It only returns the first record in `x`. Defaults to `FALSE` Applies to vector function only. |\n\n\n\n## Value\n `llm_translate` returns a `data.frame` or `tbl` object. `llm_vec_translate` returns a vector that is the same length as `x`. 
\n\n\n## Examples\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n \nlibrary(mall) \n \ndata(\"reviews\") \n \nllm_use(\"ollama\", \"llama3.2\", seed = 100, .silent = TRUE) \n \n# Pass the desired language to translate to \nllm_translate(reviews, review, \"spanish\") \n#> # A tibble: 3 × 2\n#> review .translation \n#> \n#> 1 This has been the best TV I've ever used. Gr… Esta ha sido la mejor televisió…\n#> 2 I regret buying this laptop. It is too slow … Me arrepiento de comprar este p…\n#> 3 Not sure how to feel about my new washing ma… No estoy seguro de cómo me sien…\n```\n:::\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/reference/llm_use/execute-results/html.json b/_freeze/reference/llm_use/execute-results/html.json index 71920e3..809195a 100644 --- a/_freeze/reference/llm_use/execute-results/html.json +++ b/_freeze/reference/llm_use/execute-results/html.json @@ -2,7 +2,7 @@ "hash": "84eedf7eec066709f406e09aee9d91c6", "result": { "engine": "knitr", - "markdown": "---\ntitle: \"Specify the model to use\"\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n[R/llm-use.R](https://github.com/edgararuiz/mall/blob/main/R/llm-use.R)\n\n## llm_use\n\n## Description\n Allows us to specify the back-end provider, model to use during the current R session \n\n\n## Usage\n```r\n \nllm_use( \n backend = NULL, \n model = NULL, \n ..., \n .silent = FALSE, \n .cache = NULL, \n .force = FALSE \n) \n```\n\n## Arguments\n|Arguments|Description|\n|---|---|\n| backend | The name of an supported back-end provider. Currently only 'ollama' is supported. |\n| model | The name of model supported by the back-end provider |\n| ... | Additional arguments that this function will pass down to the integrating function. In the case of Ollama, it will pass those arguments to `ollamar::chat()`. |\n| .silent | Avoids console output |\n| .cache | The path to save model results, so they can be re-used if the same operation is ran again. 
To turn off, set this argument to an empty character: `\"\"`. 'It defaults to '_mall_cache'. If this argument is left `NULL` when calling this function, no changes to the path will be made. |\n| .force | Flag that tell the function to reset all of the settings in the R session |\n\n\n\n## Value\n A `mall_session` object \n\n\n## Examples\n\n\n::: {.cell}\n\n```{.r .cell-code}\n \nlibrary(mall) \n \nllm_use(\"ollama\", \"llama3.2\") \n#> \n#> ── mall session object\n#> Backend: ollama\n#> LLM session: model:llama3.2\n#> R session: cache_folder:_mall_cache\n \n# Additional arguments will be passed 'as-is' to the \n# downstream R function in this example, to ollama::chat() \nllm_use(\"ollama\", \"llama3.2\", seed = 100, temp = 0.1) \n#> \n#> ── mall session object \n#> Backend: ollamaLLM session: model:llama3.2\n#> seed:100\n#> temp:0.1\n#> R session: cache_folder:_mall_cache\n \n# During the R session, you can change any argument \n# individually and it will retain all of previous \n# arguments used \nllm_use(temp = 0.3) \n#> \n#> ── mall session object \n#> Backend: ollamaLLM session: model:llama3.2\n#> seed:100\n#> temp:0.3\n#> R session: cache_folder:_mall_cache\n \n# Use .cache to modify the target folder for caching \nllm_use(.cache = \"_my_cache\") \n#> \n#> ── mall session object \n#> Backend: ollamaLLM session: model:llama3.2\n#> seed:100\n#> temp:0.3\n#> R session: cache_folder:_my_cache\n \n# Leave .cache empty to turn off this functionality \nllm_use(.cache = \"\") \n#> \n#> ── mall session object \n#> Backend: ollamaLLM session: model:llama3.2\n#> seed:100\n#> temp:0.3\n \n# Use .silent to avoid the print out \nllm_use(.silent = TRUE) \n \n```\n:::\n", + "markdown": "---\ntitle: \"Specify the model to use\"\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n\n[R/llm-use.R](https://github.com/edgararuiz/mall/blob/main/R/llm-use.R)\n\n## llm_use\n\n## Description\n Allows us to specify the back-end provider, model to use during the current R session 
\n\n\n## Usage\n```r\n \nllm_use( \n backend = NULL, \n model = NULL, \n ..., \n .silent = FALSE, \n .cache = NULL, \n .force = FALSE \n) \n```\n\n## Arguments\n|Arguments|Description|\n|---|---|\n| backend | The name of an supported back-end provider. Currently only 'ollama' is supported. |\n| model | The name of model supported by the back-end provider |\n| ... | Additional arguments that this function will pass down to the integrating function. In the case of Ollama, it will pass those arguments to `ollamar::chat()`. |\n| .silent | Avoids console output |\n| .cache | The path to save model results, so they can be re-used if the same operation is ran again. To turn off, set this argument to an empty character: `\"\"`. 'It defaults to '_mall_cache'. If this argument is left `NULL` when calling this function, no changes to the path will be made. |\n| .force | Flag that tell the function to reset all of the settings in the R session |\n\n\n\n## Value\n A `mall_session` object \n\n\n## Examples\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n \nlibrary(mall) \n \nllm_use(\"ollama\", \"llama3.2\") \n#> \n#> ── mall session object\n#> Backend: ollama\n#> LLM session: model:llama3.2\n#> R session: cache_folder:_mall_cache\n \n# Additional arguments will be passed 'as-is' to the \n# downstream R function in this example, to ollama::chat() \nllm_use(\"ollama\", \"llama3.2\", seed = 100, temp = 0.1) \n#> \n#> ── mall session object \n#> Backend: ollamaLLM session: model:llama3.2\n#> seed:100\n#> temp:0.1\n#> R session: cache_folder:_mall_cache\n \n# During the R session, you can change any argument \n# individually and it will retain all of previous \n# arguments used \nllm_use(temp = 0.3) \n#> \n#> ── mall session object \n#> Backend: ollamaLLM session: model:llama3.2\n#> seed:100\n#> temp:0.3\n#> R session: cache_folder:_mall_cache\n \n# Use .cache to modify the target folder for caching \nllm_use(.cache = \"_my_cache\") \n#> \n#> ── mall session object \n#> Backend: ollamaLLM 
session: model:llama3.2\n#> seed:100\n#> temp:0.3\n#> R session: cache_folder:_my_cache\n \n# Leave .cache empty to turn off this functionality \nllm_use(.cache = \"\") \n#> \n#> ── mall session object \n#> Backend: ollamaLLM session: model:llama3.2\n#> seed:100\n#> temp:0.3\n \n# Use .silent to avoid the print out \nllm_use(.silent = TRUE) \n \n```\n:::\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/reference/m_backend_submit/execute-results/html.json b/_freeze/reference/m_backend_submit/execute-results/html.json index 80e9b0d..cda5d01 100644 --- a/_freeze/reference/m_backend_submit/execute-results/html.json +++ b/_freeze/reference/m_backend_submit/execute-results/html.json @@ -2,7 +2,7 @@ "hash": "ae407edd991c3bb6d06e7b77c3db287a", "result": { "engine": "knitr", - "markdown": "---\ntitle: \"Functions to integrate different back-ends\"\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n[R/m-backend-prompt.R, R/m-backend-submit.R](https://github.com/edgararuiz/mall/blob/main/R/m-backend-prompt.R, R/m-backend-submit.R)\n\n## m_backend_prompt\n\n## Description\n Functions to integrate different back-ends \n\n\n## Usage\n```r\n \nm_backend_prompt(backend, additional) \n \nm_backend_submit(backend, x, prompt, preview = FALSE) \n```\n\n## Arguments\n|Arguments|Description|\n|---|---|\n| backend | An `mall_session` object |\n| additional | Additional text to insert to the `base_prompt` |\n| x | The body of the text to be submitted to the LLM |\n| prompt | The additional information to add to the submission |\n| preview | If `TRUE`, it will display the resulting R call of the first text in `x` |\n\n\n\n## Value\n `m_backend_submit` does not return an object. `m_backend_prompt` returns a list of functions that contain the base prompts. 
\n\n\n\n\n", + "markdown": "---\ntitle: \"Functions to integrate different back-ends\"\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n\n[R/m-backend-prompt.R, R/m-backend-submit.R](https://github.com/edgararuiz/mall/blob/main/R/m-backend-prompt.R, R/m-backend-submit.R)\n\n## m_backend_prompt\n\n## Description\n Functions to integrate different back-ends \n\n\n## Usage\n```r\n \nm_backend_prompt(backend, additional) \n \nm_backend_submit(backend, x, prompt, preview = FALSE) \n```\n\n## Arguments\n|Arguments|Description|\n|---|---|\n| backend | An `mall_session` object |\n| additional | Additional text to insert to the `base_prompt` |\n| x | The body of the text to be submitted to the LLM |\n| prompt | The additional information to add to the submission |\n| preview | If `TRUE`, it will display the resulting R call of the first text in `x` |\n\n\n\n## Value\n `m_backend_submit` does not return an object. `m_backend_prompt` returns a list of functions that contain the base prompts. \n\n\n\n\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/reference/reviews/execute-results/html.json b/_freeze/reference/reviews/execute-results/html.json index 96864e7..867c255 100644 --- a/_freeze/reference/reviews/execute-results/html.json +++ b/_freeze/reference/reviews/execute-results/html.json @@ -2,7 +2,7 @@ "hash": "e141f1285e2ad9e09a5a074db4ed673f", "result": { "engine": "knitr", - "markdown": "---\ntitle: \"Mini reviews data set\"\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n[R/data-reviews.R](https://github.com/edgararuiz/mall/blob/main/R/data-reviews.R)\n\n## reviews\n\n## Description\n Mini reviews data set \n\n## Format\n A data frame that contains 3 records. The records are of fictitious product reviews. 
\n\n## Usage\n```r\n \nreviews \n```\n\n\n\n\n\n\n## Examples\n\n\n::: {.cell}\n\n```{.r .cell-code}\n \nlibrary(mall) \ndata(reviews) \nreviews \n#> # A tibble: 3 × 1\n#> review \n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. \n#> 2 I regret buying this laptop. It is too slow and the keyboard is too noisy \n#> 3 Not sure how to feel about my new washing machine. Great color, but hard to f…\n```\n:::\n", + "markdown": "---\ntitle: \"Mini reviews data set\"\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n\n[R/data-reviews.R](https://github.com/edgararuiz/mall/blob/main/R/data-reviews.R)\n\n## reviews\n\n## Description\n Mini reviews data set \n\n## Format\n A data frame that contains 3 records. The records are of fictitious product reviews. \n\n## Usage\n```r\n \nreviews \n```\n\n\n\n\n\n\n## Examples\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n \nlibrary(mall) \ndata(reviews) \nreviews \n#> # A tibble: 3 × 1\n#> review \n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. \n#> 2 I regret buying this laptop. It is too slow and the keyboard is too noisy \n#> 3 Not sure how to feel about my new washing machine. 
Great color, but hard to f…\n```\n:::\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_quarto.yml b/_quarto.yml index 4066be6..90c5d59 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -52,7 +52,7 @@ quartodoc: options: null style: pkgdown dir: reference - out_index: python_index.qmd + out_index: _api_index.qmd dynamic: true sections: - title: mall diff --git a/reference/python_index.qmd b/reference/_api_index.qmd similarity index 53% rename from reference/python_index.qmd rename to reference/_api_index.qmd index 2bca752..901fdf5 100644 --- a/reference/python_index.qmd +++ b/reference/_api_index.qmd @@ -1,7 +1,7 @@ ---- -toc: false ---- - +# Function reference {.doc .doc-index} + +## mall + | | | diff --git a/reference/index.qmd b/reference/index.qmd index 0044f9d..49219cd 100644 --- a/reference/index.qmd +++ b/reference/index.qmd @@ -3,7 +3,13 @@ {{< include r_index.qmd >}} ## python -{{< include python_index.qmd >}} + + + +[MallFrame](MallFrame.qmd#mall.MallFrame) + +      Extension to Polars that add ability to use +an LLM to run batch predictions over a data frame ::: \ No newline at end of file diff --git a/reference/r_index.qmd b/reference/r_index.qmd index 3c5c731..761cf58 100644 --- a/reference/r_index.qmd +++ b/reference/r_index.qmd @@ -1,9 +1,7 @@ --- toc: false --- - - -# Function Reference + [llm_classify()](llm_classify.html) [llm_vec_classify()](llm_classify.html) diff --git a/utils/website/README.md b/utils/website/README.md index a0c5076..6b07432 100644 --- a/utils/website/README.md +++ b/utils/website/README.md @@ -2,8 +2,8 @@ ```r devtools::install(upgrade = "never") -try(fs::dir_delete("_freeze/reference/")) -source("utils/website/build_reference.R") +#try(fs::dir_delete("_freeze/reference/")) +source(here::here("utils/website/build_reference.R")) quarto::quarto_render(as_job = FALSE) quarto::quarto_preview() ``` diff --git a/utils/website/build_reference.R b/utils/website/build_reference.R index 674d8ce..afb7fb5 100644 --- 
a/utils/website/build_reference.R +++ b/utils/website/build_reference.R @@ -8,9 +8,9 @@ library(cli) build_reference_index <- function(pkg = ".", folder = "reference") { if (is.character(pkg)) pkg <- pkgdown::as_pkgdown(pkg) try(dir_create(folder)) - ref_path <- path(folder, "index", ext = "qmd") + ref_path <- path(folder, "r_index", ext = "qmd") try(file_delete(ref_path)) - writeLines(reference_index(), ref_path) + writeLines(reference_index(pkg), ref_path) cli_inform(col_green(ref_path)) } @@ -29,5 +29,5 @@ build_reference <- function(pkg = ".", folder = "reference") { ) } -build_reference_index() -build_reference() +build_reference_index("r") +build_reference("r") diff --git a/utils/website/index-page.R b/utils/website/index-page.R index d080733..7af798b 100644 --- a/utils/website/index-page.R +++ b/utils/website/index-page.R @@ -17,9 +17,7 @@ reference_index <- function(pkg = ".") { "---", "toc: false", "---", - "", - "", - "# Function Reference", + "", "", res ) From dde2375604c2a292fc6511369c08cf70a8662d5e Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Sun, 6 Oct 2024 14:14:34 -0500 Subject: [PATCH 41/57] Re-writes intro --- index.qmd | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/index.qmd b/index.qmd index 7cded57..4a551ac 100644 --- a/index.qmd +++ b/index.qmd @@ -10,7 +10,7 @@ execute: ```{r} #| include: false -#| eval: true + library(dplyr) library(dbplyr) @@ -26,7 +26,9 @@ mall::llm_use("ollama", "llama3.2", seed = 100, .cache = "_readme_cache") Run multiple LLM predictions against a data frame. The predictions are processed row-wise over a specified column. It works using a pre-determined one-shot prompt, along with the current row's content. The prompt that is use will depend of the -type of analysis needed. Currently, the included prompts perform the following: +type of analysis needed. 
+ +Currently, the included prompts perform the following: - [Sentiment analysis](#sentiment) - [Text summarizing](#summarize) @@ -37,9 +39,26 @@ type of analysis needed. Currently, the included prompts perform the following: This package is inspired by the SQL AI functions now offered by vendors such as [Databricks](https://docs.databricks.com/en/large-language-models/ai-functions.html) -and Snowflake. `mall` uses [Ollama](https://ollama.com/) to -interact with LLMs installed locally. That interaction takes place via the -[`ollamar`](https://hauselin.github.io/ollama-r/) package. +and Snowflake. `mall` uses [Ollama](https://ollama.com/) to interact with LLMs +installed locally. + + +For R, that interaction takes place via the +[`ollamar`](https://hauselin.github.io/ollama-r/) package. The functions are +designed to easily work with piped commands, such as `dplyr`. + +```r +reviews |> + llm_sentiment(review) +``` + + +For Python, `mall` includes an extension to [Polars](https://pola.rs/). To +interact with Ollama, it uses the official [Python library](https://github.com/ollama/ollama-python). + +```python +reviews.llm.sentiment("review") +``` ## Motivation From b81223b658d2ba1278307f67268e20d636436df7 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Sun, 6 Oct 2024 14:31:07 -0500 Subject: [PATCH 42/57] Expands Motivation --- index.qmd | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/index.qmd b/index.qmd index 4a551ac..cf72d5b 100644 --- a/index.qmd +++ b/index.qmd @@ -64,17 +64,17 @@ reviews.llm.sentiment("review") We want to new find ways to help data scientists use LLMs in their daily work. Unlike the familiar interfaces, such as chatting and code completion, this interface -runs your text data directly against the LLM. The LLM's flexibility, allows for -it to adapt to the subject of your data, and provide surprisingly accurate predictions. -This saves the data scientist the need to write and tune an NLP model. 
+runs your text data directly against the LLM. -```{r} -#| include: false +The LLM's flexibility, allows for it to adapt to the subject of your data, and +provide surprisingly accurate predictions. This saves the data scientist the +need to write and tune an NLP model. -# Add paragraph about: thanks to the more widespread availability of capable -# local llms, data does not leave your company, no $$ cost to use - -``` +In recent times, the capabilities of LLMs that can run locally in your computer +have increased dramatically. This means that these sort of analysis can run +in your machine with good accuracy. Additionally, it makes it possible to take +advantage of LLM's at your institution, since the data will not leave the +corporate network. ## Get started From 133c1d10c601687dcc03970d0b32f1070cb3123a Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Sun, 6 Oct 2024 16:20:26 -0500 Subject: [PATCH 43/57] Completes get started section --- index.qmd | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/index.qmd b/index.qmd index cf72d5b..67682f6 100644 --- a/index.qmd +++ b/index.qmd @@ -96,6 +96,13 @@ pip install "mall @ git+https://git@github.com/edgararuiz/mall.git@python#subdir ### With local LLMs +- [Download Ollama from the official website](https://ollama.com/download) + +- Install and start Ollama in your computer + + +::: {.panel-tabset group="language"} +## R - Install Ollama in your machine. The `ollamar` package's website provides this [Installation guide](https://hauselin.github.io/ollama-r/#installation) @@ -104,8 +111,25 @@ Llama 3.2 to test. To get that model you can run: ```r ollamar::pull("llama3.2") ``` + +## python + +- Install the official Ollama library + ```python + pip install ollama + ``` + +- Download an LLM model. For example, I have been developing this package using +Llama 3.2 to test. 
To get that model you can run: + ```python + import ollama + ollama.pull('llama3.2') + ``` +::: -### With Databricks + + +### With Databricks (R only) If you pass a table connected to **Databricks** via `odbc`, `mall` will automatically use Databricks' LLM instead of Ollama. *You won't need Ollama From 2dd36d56f8c82dd228f9bf49764bd9b14eec1b37 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Sun, 6 Oct 2024 16:34:58 -0500 Subject: [PATCH 44/57] Adds MallData, updates index.qmd --- index.qmd | 18 +++++------------- python/mall/__init__.py | 3 ++- python/mall/data.py | 12 ++++++++++++ python/mall/polars.py | 8 ++++---- 4 files changed, 23 insertions(+), 18 deletions(-) create mode 100644 python/mall/data.py diff --git a/index.qmd b/index.qmd index 67682f6..3555e93 100644 --- a/index.qmd +++ b/index.qmd @@ -152,8 +152,6 @@ We will start with loading a very small data set contained in `mall`. It has ## R ```{r} - - library(mall) data("reviews") @@ -164,20 +162,14 @@ reviews ## python ```{python} - - - +#| eval: true import mall import polars as pl -reviews = pl.DataFrame( - data=[ - "This has been the best TV I've ever used. Great screen, and sound.", - "I regret buying this laptop. It is too slow and the keyboard is too noisy", - "Not sure how to feel about my new washing machine. 
Great color, but hard to figure" - ], - schema=[("review", pl.String)], -) +data = mall.MallData + +reviews = data.reviews +reviews ``` ::: diff --git a/python/mall/__init__.py b/python/mall/__init__.py index 4125a70..e623439 100644 --- a/python/mall/__init__.py +++ b/python/mall/__init__.py @@ -1,3 +1,4 @@ -__all__ = ["MallFrame"] +__all__ = ["MallFrame", "MallData"] from mall.polars import MallFrame +from mall.data import MallData diff --git a/python/mall/data.py b/python/mall/data.py new file mode 100644 index 0000000..7d78cb7 --- /dev/null +++ b/python/mall/data.py @@ -0,0 +1,12 @@ +import polars as pl + +class MallData: + reviews = pl.DataFrame( + data=[ + "This has been the best TV I've ever used. Great screen, and sound.", + "I regret buying this laptop. It is too slow and the keyboard is too noisy", + "Not sure how to feel about my new washing machine. Great color, but hard to figure" + ], + schema=[("review", pl.String)], + ) + diff --git a/python/mall/polars.py b/python/mall/polars.py index 359c9c6..988390d 100644 --- a/python/mall/polars.py +++ b/python/mall/polars.py @@ -63,17 +63,17 @@ def translate( Parameters ------ - col + col: str The name of the text field to process - language + language: str The target language to translate to. For example 'French'. 
- pred_name + pred_name: str A character vector with the name of the new column where the prediction will be placed - additional + additional: str Inserts this text into the prompt sent to the LLM """ df = map_call( From 1219a93ca37ee2313210d3f756e6ea155106b521 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Sun, 6 Oct 2024 17:04:22 -0500 Subject: [PATCH 45/57] Adds support for named options --- python/mall/prompt.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/python/mall/prompt.py b/python/mall/prompt.py index 878fc0a..0004f62 100644 --- a/python/mall/prompt.py +++ b/python/mall/prompt.py @@ -6,7 +6,14 @@ def process_labels(x, if_list="", if_dict=""): out = out.strip() out = out.replace(" ", ", ") out = if_list.replace("{values}", out) - return out + if isinstance(x, dict): + out = "" + for i in x: + new = if_dict + new = new.replace("{key}", i) + new = new.replace("{value}", x.get(i)) + out += " " + new + return out def sentiment(options, additional=""): @@ -75,3 +82,5 @@ def classify(labels, additional=""): } ] return msg + + From 4bb18c6fbd2b0a5874ea5157cfaa5ead9eac25b5 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Mon, 7 Oct 2024 07:28:11 -0500 Subject: [PATCH 46/57] Adds extract --- python/mall/data.py | 8 ++++---- python/mall/polars.py | 29 +++++++++++++++++++++++------ python/mall/prompt.py | 30 +++++++++++++++++++++++++++++- 3 files changed, 56 insertions(+), 11 deletions(-) diff --git a/python/mall/data.py b/python/mall/data.py index 7d78cb7..73b7b23 100644 --- a/python/mall/data.py +++ b/python/mall/data.py @@ -1,12 +1,12 @@ import polars as pl + class MallData: reviews = pl.DataFrame( data=[ - "This has been the best TV I've ever used. Great screen, and sound.", + "This has been the best TV I've ever used. Great screen, and sound.", "I regret buying this laptop. It is too slow and the keyboard is too noisy", - "Not sure how to feel about my new washing machine. 
Great color, but hard to figure" - ], + "Not sure how to feel about my new washing machine. Great color, but hard to figure", + ], schema=[("review", pl.String)], ) - diff --git a/python/mall/polars.py b/python/mall/polars.py index 988390d..8d12ab7 100644 --- a/python/mall/polars.py +++ b/python/mall/polars.py @@ -1,13 +1,14 @@ import polars as pl -from mall.prompt import sentiment, summarize, translate, classify +from mall.prompt import sentiment, summarize, translate, classify, extract from mall.llm import llm_call @pl.api.register_dataframe_namespace("llm") class MallFrame: - """Extension to Polars that add ability to use - an LLM to run batch predictions over a data frame + """Extension to Polars that add ability to use + an LLM to run batch predictions over a data frame """ + def __init__(self, df: pl.DataFrame) -> None: self._df = df self._use = {"backend": "ollama", "model": "llama3.2"} @@ -67,15 +68,15 @@ def translate( The name of the text field to process language: str - The target language to translate to. For example 'French'. + The target language to translate to. For example 'French'. 
pred_name: str A character vector with the name of the new column where the prediction will be placed - + additional: str Inserts this text into the prompt sent to the LLM - """ + """ df = map_call( df=self._df, col=col, @@ -101,6 +102,22 @@ def classify( ) return df + def extract( + self, + col, + labels="", + additional="", + pred_name="extract", + ) -> list[pl.DataFrame]: + df = map_call( + df=self._df, + col=col, + msg=extract(labels, additional=additional), + pred_name=pred_name, + use=self._use, + ) + return df + def map_call(df, col, msg, pred_name, use): df = df.with_columns( diff --git a/python/mall/prompt.py b/python/mall/prompt.py index 0004f62..8b209a8 100644 --- a/python/mall/prompt.py +++ b/python/mall/prompt.py @@ -13,7 +13,7 @@ def process_labels(x, if_list="", if_dict=""): new = new.replace("{key}", i) new = new.replace("{value}", x.get(i)) out += " " + new - return out + return out def sentiment(options, additional=""): @@ -84,3 +84,31 @@ def classify(labels, additional=""): return msg +def extract(labels, additional=""): + no_labels = len(labels) + col_labels = "" + for label in labels: + col_labels += label + " " + col_labels = col_labels.rstrip() + col_labels = col_labels.replace(" ", ", ") + if no_labels > 1: + plural = "s" + text_multi = ( + "Return the response in a simple list, pipe separated, and no headers. " + ) + else: + plural = "" + text_multi = "" + msg = [ + { + "role": "user", + "content": "You are a helpful text extraction engine." + + f"Extract the {col_labels} being referred to on the text." + + f"I expect {no_labels} item{plural} exactly." + + "No capitalization. No explanations." 
+ + f"{text_multi}" + + f"{additional}" + + "The answer is based on the following text:\n{}", + } + ] + return msg From 0b7237e5207c2f91eb2c3d7f44e348ce197df1f5 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Mon, 7 Oct 2024 07:46:03 -0500 Subject: [PATCH 47/57] Fixes label issue in extract --- python/README.qmd | 12 ++++-------- python/mall/prompt.py | 15 +++++++++------ 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/python/README.qmd b/python/README.qmd index af8b3ed..c189c5b 100644 --- a/python/README.qmd +++ b/python/README.qmd @@ -31,14 +31,10 @@ html_formatter.for_type(pl.DataFrame, lambda df: "\n".join(HTMLFormatter(df).ren import mall import polars as pl -reviews = pl.DataFrame( - data=[ - "This has been the best TV I've ever used. Great screen, and sound.", - "I regret buying this laptop. It is too slow and the keyboard is too noisy", - "Not sure how to feel about my new washing machine. Great color, but hard to figure" - ], - schema=[("review", pl.String)], -) +data = mall.MallData + +reviews = data.reviews + ``` ```{python} diff --git a/python/mall/prompt.py b/python/mall/prompt.py index 8b209a8..0f5e91e 100644 --- a/python/mall/prompt.py +++ b/python/mall/prompt.py @@ -85,20 +85,23 @@ def classify(labels, additional=""): def extract(labels, additional=""): - no_labels = len(labels) col_labels = "" - for label in labels: - col_labels += label + " " - col_labels = col_labels.rstrip() - col_labels = col_labels.replace(" ", ", ") - if no_labels > 1: + if isinstance(labels, list): + no_labels = len(labels) plural = "s" text_multi = ( "Return the response in a simple list, pipe separated, and no headers. 
" ) + for label in labels: + col_labels += label + " " + col_labels = col_labels.rstrip() + col_labels = col_labels.replace(" ", ", ") else: + no_labels = 1 plural = "" text_multi = "" + col_labels = labels + msg = [ { "role": "user", From 396a9c81c218c88fd56c5a1a5a16cf282621ca51 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Mon, 7 Oct 2024 08:25:06 -0500 Subject: [PATCH 48/57] Adds custom --- python/mall/polars.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/python/mall/polars.py b/python/mall/polars.py index 8d12ab7..5c5027c 100644 --- a/python/mall/polars.py +++ b/python/mall/polars.py @@ -118,6 +118,22 @@ def extract( ) return df + def custom( + self, + col, + prompt="", + valid_resps="", + pred_name="custom", + ) -> list[pl.DataFrame]: + df = map_call( + df=self._df, + col=col, + msg=prompt, + pred_name=pred_name, + use=self._use, + ) + return df + def map_call(df, col, msg, pred_name, use): df = df.with_columns( From be46aeb2f6493ae5d93fb7ca95c95f303fe7459b Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Mon, 7 Oct 2024 09:32:54 -0500 Subject: [PATCH 49/57] Adds support for valid_resps, fixes custom --- python/README.qmd | 3 --- python/mall/llm.py | 21 +++++++++++++-------- python/mall/polars.py | 12 ++++++++---- python/mall/prompt.py | 5 +++++ 4 files changed, 26 insertions(+), 15 deletions(-) diff --git a/python/README.qmd b/python/README.qmd index c189c5b..862f56a 100644 --- a/python/README.qmd +++ b/python/README.qmd @@ -30,11 +30,8 @@ html_formatter.for_type(pl.DataFrame, lambda df: "\n".join(HTMLFormatter(df).ren ```{python} import mall import polars as pl - data = mall.MallData - reviews = data.reviews - ``` ```{python} diff --git a/python/mall/llm.py b/python/mall/llm.py index a84d23c..4d7b2a0 100644 --- a/python/mall/llm.py +++ b/python/mall/llm.py @@ -8,18 +8,23 @@ def build_msg(x, msg): return out -def llm_call(x, msg, use, preview=True): - # print( - # dict( - # model=use.get("model"), - # messages=build_msg(x, msg), - # 
options=use.get("options"), - # ) - # ) +def llm_call(x, msg, use, preview=True, valid_resps=""): resp = ollama.chat( model=use.get("model"), messages=build_msg(x, msg), options=use.get("options"), ) out = resp["message"]["content"] + if isinstance(valid_resps, list): + if out not in valid_resps: + out = None return out + + +# print( +# dict( +# model=use.get("model"), +# messages=build_msg(x, msg), +# options=use.get("options"), +# ) +# ) diff --git a/python/mall/polars.py b/python/mall/polars.py index 5c5027c..b672c40 100644 --- a/python/mall/polars.py +++ b/python/mall/polars.py @@ -1,5 +1,5 @@ import polars as pl -from mall.prompt import sentiment, summarize, translate, classify, extract +from mall.prompt import sentiment, summarize, translate, classify, extract, custom from mall.llm import llm_call @@ -34,6 +34,7 @@ def sentiment( msg=sentiment(options, additional=additional), pred_name=pred_name, use=self._use, + valid_resps=options, ) return df @@ -99,6 +100,7 @@ def classify( msg=classify(labels, additional=additional), pred_name=pred_name, use=self._use, + valid_resps=labels, ) return df @@ -115,6 +117,7 @@ def extract( msg=extract(labels, additional=additional), pred_name=pred_name, use=self._use, + valid_resps=labels, ) return df @@ -128,18 +131,19 @@ def custom( df = map_call( df=self._df, col=col, - msg=prompt, + msg=custom(prompt), pred_name=pred_name, use=self._use, + valid_resps=valid_resps, ) return df -def map_call(df, col, msg, pred_name, use): +def map_call(df, col, msg, pred_name, use, valid_resps=""): df = df.with_columns( pl.col(col) .map_elements( - lambda x: llm_call(x, msg, use), + lambda x: llm_call(x, msg, use, False, valid_resps), return_dtype=pl.String, ) .alias(pred_name) diff --git a/python/mall/prompt.py b/python/mall/prompt.py index 0f5e91e..cd7510a 100644 --- a/python/mall/prompt.py +++ b/python/mall/prompt.py @@ -115,3 +115,8 @@ def extract(labels, additional=""): } ] return msg + + +def custom(prompt): + msg = [{"role": "user", 
"content": f"{prompt}" + ": \n{}"}] + return msg From 54ab6c4afb619874d126ec1299a07218240914ae Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Mon, 7 Oct 2024 10:01:44 -0500 Subject: [PATCH 50/57] Adds build_hash --- python/mall/llm.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/python/mall/llm.py b/python/mall/llm.py index 4d7b2a0..8788f69 100644 --- a/python/mall/llm.py +++ b/python/mall/llm.py @@ -1,4 +1,6 @@ import ollama +import json +import hashlib def build_msg(x, msg): @@ -9,6 +11,16 @@ def build_msg(x, msg): def llm_call(x, msg, use, preview=True, valid_resps=""): + + call = dict( + model=use.get("model"), + messages=build_msg(x, msg), + options=use.get("options"), + ) + + if preview: + print(call) + resp = ollama.chat( model=use.get("model"), messages=build_msg(x, msg), @@ -21,10 +33,9 @@ def llm_call(x, msg, use, preview=True, valid_resps=""): return out -# print( -# dict( -# model=use.get("model"), -# messages=build_msg(x, msg), -# options=use.get("options"), -# ) -# ) +def build_hash(x): + if isinstance(x, dict): + x = json.dumps(x) + x_sha = hashlib.sha1(x.encode("utf-8")) + x_digest = x_sha.hexdigest() + return x_digest From 65f011b38c602649f214e4e682b61fce7dc5eb30 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Mon, 7 Oct 2024 10:44:55 -0500 Subject: [PATCH 51/57] Adds _cache to use --- python/mall/polars.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/python/mall/polars.py b/python/mall/polars.py index b672c40..e7aff5b 100644 --- a/python/mall/polars.py +++ b/python/mall/polars.py @@ -11,13 +11,18 @@ class MallFrame: def __init__(self, df: pl.DataFrame) -> None: self._df = df - self._use = {"backend": "ollama", "model": "llama3.2"} + self._use = dict( + backend = "ollama", + model = "llama3.2", + _cache = "_mall_cache" + ) - def use(self, backend="", model="", **kwargs): + def use(self, backend="", model="", _cache = "_mall_cache", **kwargs): if backend != "": - 
self._use = {"backend": backend, "model": self._use["model"]} + self._use.update(dict(backend = backend)) if model != "": - self._use = {"backend": self._use["backend"], "model": model} + self._use.update(dict(model = model)) + self._use.update(dict(_cache = _cache)) self._use.update(dict(kwargs)) return self._use From a564a2470c5d3c7d86b0d24425b1dab59650011e Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Mon, 7 Oct 2024 12:30:39 -0500 Subject: [PATCH 52/57] Adds support for cache --- python/mall/llm.py | 57 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 7 deletions(-) diff --git a/python/mall/llm.py b/python/mall/llm.py index 8788f69..dc89ee3 100644 --- a/python/mall/llm.py +++ b/python/mall/llm.py @@ -1,6 +1,7 @@ import ollama import json import hashlib +import os def build_msg(x, msg): @@ -10,7 +11,7 @@ def build_msg(x, msg): return out -def llm_call(x, msg, use, preview=True, valid_resps=""): +def llm_call(x, msg, use, preview=False, valid_resps=""): call = dict( model=use.get("model"), @@ -21,12 +22,25 @@ def llm_call(x, msg, use, preview=True, valid_resps=""): if preview: print(call) - resp = ollama.chat( - model=use.get("model"), - messages=build_msg(x, msg), - options=use.get("options"), - ) - out = resp["message"]["content"] + cache = "" + if use.get("_cache") != "": + hash_call = build_hash(call) + cache = cache_check(hash_call, use) + + if cache == "": + resp = ollama.chat( + model=use.get("model"), + messages=build_msg(x, msg), + options=use.get("options"), + ) + out = resp["message"]["content"] + else: + out = cache + + if use.get("_cache") != "": + if cache == "": + cache_record(hash_call, use, call, out) + if isinstance(valid_resps, list): if out not in valid_resps: out = None @@ -39,3 +53,32 @@ def build_hash(x): x_sha = hashlib.sha1(x.encode("utf-8")) x_digest = x_sha.hexdigest() return x_digest + + +def cache_check(hash_call, use): + file_path = cache_path(hash_call, use) + if os.path.isfile(file_path): + 
file_connection = open(file_path) + file_read = file_connection.read() + file_parse = json.loads(file_read) + out = file_parse.get("response") + else: + out = "" + return out + + +def cache_record(hash_call, use, call, response): + file_path = cache_path(hash_call, use) + file_folder = os.path.dirname(file_path) + if not os.path.isdir(file_folder): + os.makedirs(file_folder) + contents = dict(request=call, response=response) + json_contents = json.dumps(contents) + with open(file_path, "w") as file: + file.write(json_contents) + + +def cache_path(hash_call, use): + sub_folder = hash_call[0:2] + file_path = use.get("_cache") + "/" + sub_folder + "/" + hash_call + ".json" + return file_path From 1b26c489ae353e7366f532b6a2be96325368c324 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Mon, 7 Oct 2024 13:44:43 -0500 Subject: [PATCH 53/57] Enables eval now that we have cache in Python, runs everything with cache, enables freeze --- _freeze/index/execute-results/html.json | 15 +++++++++ index.qmd | 44 ++++++++++++++++++++++--- 2 files changed, 54 insertions(+), 5 deletions(-) create mode 100644 _freeze/index/execute-results/html.json diff --git a/_freeze/index/execute-results/html.json b/_freeze/index/execute-results/html.json new file mode 100644 index 0000000..bf47ab4 --- /dev/null +++ b/_freeze/index/execute-results/html.json @@ -0,0 +1,15 @@ +{ + "hash": "21609c68109605e29ecc8b10d4d012b3", + "result": { + "engine": "knitr", + "markdown": "---\nformat:\n html:\n toc: true\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n\n\n\nRun multiple LLM predictions against a data frame. The predictions are processed \nrow-wise over a specified column. It works using a pre-determined one-shot prompt,\nalong with the current row's content. The prompt that is use will depend of the\ntype of analysis needed. 
\n\nCurrently, the included prompts perform the following: \n\n- [Sentiment analysis](#sentiment)\n- [Text summarizing](#summarize)\n- [Classify text](#classify)\n- [Extract one, or several](#extract), specific pieces information from the text\n- [Translate text](#translate)\n- [Custom prompt](#custom-prompt)\n\nThis package is inspired by the SQL AI functions now offered by vendors such as\n[Databricks](https://docs.databricks.com/en/large-language-models/ai-functions.html) \nand Snowflake. `mall` uses [Ollama](https://ollama.com/) to interact with LLMs \ninstalled locally. \n\n\nFor R, that interaction takes place via the \n[`ollamar`](https://hauselin.github.io/ollama-r/) package. The functions are \ndesigned to easily work with piped commands, such as `dplyr`. \n\n```r\nreviews |>\n llm_sentiment(review)\n```\n\n\nFor Python, `mall` includes an extension to [Polars](https://pola.rs/). To\ninteract with Ollama, it uses the official [Python library](https://github.com/ollama/ollama-python).\n\n```python\nreviews.llm.sentiment(\"review\")\n```\n\n## Motivation\n\nWe want to new find ways to help data scientists use LLMs in their daily work. \nUnlike the familiar interfaces, such as chatting and code completion, this interface\nruns your text data directly against the LLM. \n\nThe LLM's flexibility, allows for it to adapt to the subject of your data, and \nprovide surprisingly accurate predictions. This saves the data scientist the\nneed to write and tune an NLP model. \n\nIn recent times, the capabilities of LLMs that can run locally in your computer \nhave increased dramatically. This means that these sort of analysis can run\nin your machine with good accuracy. Additionally, it makes it possible to take\nadvantage of LLM's at your institution, since the data will not leave the\ncorporate network. 
\n\n## Get started\n\n- Install `mall` from Github\n\n \n::: {.panel-tabset group=\"language\"}\n## R\n```r\npak::pak(\"edgararuiz/mall/r@python\")\n```\n\n## python\n```python\npip install \"mall @ git+https://git@github.com/edgararuiz/mall.git@python#subdirectory=python\"\n```\n:::\n \n\n### With local LLMs\n\n- [Download Ollama from the official website](https://ollama.com/download)\n\n- Install and start Ollama in your computer\n\n\n::: {.panel-tabset group=\"language\"}\n## R\n- Install Ollama in your machine. The `ollamar` package's website provides this\n[Installation guide](https://hauselin.github.io/ollama-r/#installation)\n\n- Download an LLM model. For example, I have been developing this package using\nLlama 3.2 to test. To get that model you can run: \n ```r\n ollamar::pull(\"llama3.2\")\n ```\n \n## python\n\n- Install the official Ollama library\n ```python\n pip install ollama\n ```\n\n- Download an LLM model. For example, I have been developing this package using\nLlama 3.2 to test. To get that model you can run: \n ```python\n import ollama\n ollama.pull('llama3.2')\n ```\n:::\n\n\n \n### With Databricks (R only)\n\nIf you pass a table connected to **Databricks** via `odbc`, `mall` will \nautomatically use Databricks' LLM instead of Ollama. *You won't need Ollama \ninstalled if you are using Databricks only.*\n\n`mall` will call the appropriate SQL AI function. For more information see our \n[Databricks article.](https://edgararuiz.github.io/mall/articles/databricks.html) \n\n## LLM functions\n\n### Sentiment\n\nPrimarily, `mall` provides verb-like functions that expect a `tbl` as \ntheir first argument. This allows us to use them in piped operations. \n\nWe will start with loading a very small data set contained in `mall`. 
It has\n3 product reviews that we will use as the source of our examples.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(mall)\n\ndata(\"reviews\")\n\nreviews\n#> # A tibble: 3 × 1\n#> review \n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. \n#> 2 I regret buying this laptop. It is too slow and the keyboard is too noisy \n#> 3 Not sure how to feel about my new washing machine. Great color, but hard to f…\n```\n:::\n\n\n\n## python\n\n\n\n::: {.cell}\n\n```{.python .cell-code}\nimport mall \nimport polars as pl\n\ndata = mall.MallData\n\nreviews = data.reviews\nreviews \n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (3, 1)
review
str
"This has been the best TV I've…
"I regret buying this laptop. I…
"Not sure how to feel about my …
\n```\n\n:::\n:::\n\n\n:::\n\n\n\n\n\n\n\nFor the first example, we'll asses the sentiment of each review. In order to \ndo this we will call `llm_sentiment()`:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n\nreviews |>\n llm_sentiment(review)\n#> # A tibble: 3 × 2\n#> review .sentiment\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. positive \n#> 2 I regret buying this laptop. It is too slow and the keyboard is to… negative \n#> 3 Not sure how to feel about my new washing machine. Great color, bu… neutral\n```\n:::\n\n\n\n## python \n\n\n\n::: {.cell}\n\n```{.python .cell-code}\n\nreviews.llm.sentiment(\"review\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (3, 2)
reviewsentiment
strstr
"This has been the best TV I've…"positive"
"I regret buying this laptop. I…"negative"
"Not sure how to feel about my …"neutral"
\n```\n\n:::\n:::\n\n\n\n:::\n\nThe function let's us modify the options to choose from: \n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n\nreviews |>\n llm_sentiment(review, options = c(\"positive\", \"negative\"))\n#> # A tibble: 3 × 2\n#> review .sentiment\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. positive \n#> 2 I regret buying this laptop. It is too slow and the keyboard is to… negative \n#> 3 Not sure how to feel about my new washing machine. Great color, bu… negative\n```\n:::\n\n\n\n## python \n\n\n\n::: {.cell}\n\n```{.python .cell-code}\n\nreviews.llm.sentiment(\"review\", options=[\"positive\", \"negative\"])\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (3, 2)
reviewsentiment
strstr
"This has been the best TV I've…"positive"
"I regret buying this laptop. I…"negative"
"Not sure how to feel about my …"negative"
\n```\n\n:::\n:::\n\n\n\n:::\n\nAs mentioned before, by being pipe friendly, the results from the LLM prediction\ncan be used in further transformations: \n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n\nreviews |>\n llm_sentiment(review, options = c(\"positive\", \"negative\")) |>\n filter(.sentiment == \"negative\")\n#> # A tibble: 2 × 2\n#> review .sentiment\n#> \n#> 1 I regret buying this laptop. It is too slow and the keyboard is to… negative \n#> 2 Not sure how to feel about my new washing machine. Great color, bu… negative\n```\n:::\n\n\n\n## python\n\n\n\n::: {.cell}\n\n```{.python .cell-code}\n\nx = reviews.llm.sentiment(\"review\", options=[\"positive\", \"negative\"])\n\nx.filter(pl.col(\"sentiment\") == \"negative\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (2, 2)
reviewsentiment
strstr
"I regret buying this laptop. I…"negative"
"Not sure how to feel about my …"negative"
\n```\n\n:::\n:::\n\n\n\n:::\n\n### Summarize\n\nThere may be a need to reduce the number of words in a given text. Usually, to \nmake it easier to capture its intent. To do this, use `llm_summarize()`. This\nfunction has an argument to control the maximum number of words to output \n(`max_words`):\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n\n\nreviews |>\n llm_summarize(review, max_words = 5)\n#> # A tibble: 3 × 2\n#> review .summary \n#> \n#> 1 This has been the best TV I've ever used. Gr… it's a great tv \n#> 2 I regret buying this laptop. It is too slow … laptop purchase was a mistake \n#> 3 Not sure how to feel about my new washing ma… having mixed feelings about it\n```\n:::\n\n\n\n## python \n\n\n\n::: {.cell}\n\n```{.python .cell-code}\n\n\nreviews.llm.summarize(\"review\", 5)\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (3, 2)
reviewsummary
strstr
"This has been the best TV I've…"it's a great tv"
"I regret buying this laptop. I…"laptop not worth the money"
"Not sure how to feel about my …"feeling uncertain about new pu…
\n```\n\n:::\n:::\n\n\n\n:::\n\nTo control the name of the prediction field, you can change `pred_name` argument.\nThis works with the other `llm` functions as well. \n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n\nreviews |>\n llm_summarize(review, max_words = 5, pred_name = \"review_summary\")\n#> # A tibble: 3 × 2\n#> review review_summary \n#> \n#> 1 This has been the best TV I've ever used. Gr… it's a great tv \n#> 2 I regret buying this laptop. It is too slow … laptop purchase was a mistake \n#> 3 Not sure how to feel about my new washing ma… having mixed feelings about it\n```\n:::\n\n\n\n## python \n\n\n\n::: {.cell}\n\n```{.python .cell-code}\n\nreviews.llm.summarize(\"review\", max_words = 5, pred_name = \"review_summary\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (3, 2)
reviewreview_summary
strstr
"This has been the best TV I've…"it's a great tv"
"I regret buying this laptop. I…"laptop not worth the money"
"Not sure how to feel about my …"feeling uncertain about new pu…
\n```\n\n:::\n:::\n\n\n\n:::\n\n### Classify\n\nUse the LLM to categorize the text into one of the options you provide: \n\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_classify(review, c(\"appliance\", \"computer\"))\n#> # A tibble: 3 × 2\n#> review .classify\n#> \n#> 1 This has been the best TV I've ever used. Gr… computer \n#> 2 I regret buying this laptop. It is too slow … computer \n#> 3 Not sure how to feel about my new washing ma… appliance\n```\n:::\n\n\n\n\n## python \n\n\n\n::: {.cell}\n\n```{.python .cell-code}\n\nreviews.llm.classify(\"review\", [\"computer\", \"appliance\"])\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (3, 2)
reviewclassify
strstr
"This has been the best TV I've…"appliance"
"I regret buying this laptop. I…"appliance"
"Not sure how to feel about my …"appliance"
\n```\n\n:::\n:::\n\n\n\n:::\n\n### Extract \n\nOne of the most interesting operations. Using natural language, we can tell the \nLLM to return a specific part of the text. In the following example, we request\nthat the LLM return the product being referred to. We do this by simply saying \n\"product\". The LLM understands what we *mean* by that word, and looks for that\nin the text.\n\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_extract(review, \"product\")\n#> # A tibble: 3 × 2\n#> review .extract \n#> \n#> 1 This has been the best TV I've ever used. Gr… tv \n#> 2 I regret buying this laptop. It is too slow … laptop \n#> 3 Not sure how to feel about my new washing ma… washing machine\n```\n:::\n\n\n\n\n## python \n\n\n\n::: {.cell}\n\n```{.python .cell-code}\n\nreviews.llm.extract(\"review\", \"product\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (3, 2)
reviewextract
strstr
"This has been the best TV I've…"tv"
"I regret buying this laptop. I…"laptop"
"Not sure how to feel about my …"washing machine"
\n```\n\n:::\n:::\n\n\n\n:::\n\n\n### Translate\n\nAs the title implies, this function will translate the text into a specified \nlanguage. What is really nice is that you don't need to specify the language\nof the source text. Only the target language needs to be defined. The translation\naccuracy will depend on the LLM.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n\n\nreviews |>\n llm_translate(review, \"spanish\")\n#> # A tibble: 3 × 2\n#> review .translation \n#> \n#> 1 This has been the best TV I've ever used. Gr… Esta ha sido la mejor televisió…\n#> 2 I regret buying this laptop. It is too slow … Me arrepiento de comprar este p…\n#> 3 Not sure how to feel about my new washing ma… No estoy seguro de cómo me sien…\n```\n:::\n\n\n\n\n## python \n\n\n\n::: {.cell}\n\n```{.python .cell-code}\n\n\nreviews.llm.translate(\"review\", \"spanish\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (3, 2)
reviewtranslation
strstr
"This has been the best TV I've…"Esta ha sido la mejor TV que h…
"I regret buying this laptop. I…"Lo lamento comprar este portát…
"Not sure how to feel about my …"No estoy seguro de cómo sentir…
\n```\n\n:::\n:::\n\n\n\n:::\n\n### Custom prompt\n\nIt is possible to pass your own prompt to the LLM, and have `mall` run it \nagainst each text entry. Use `llm_custom()` to access this functionality: \n\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nmy_prompt <- paste(\n \"Answer a question.\",\n \"Return only the answer, no explanation\",\n \"Acceptable answers are 'yes', 'no'\",\n \"Answer this about the following text, is this a happy customer?:\"\n)\n\nreviews |>\n llm_custom(review, my_prompt)\n#> # A tibble: 3 × 2\n#> review .pred\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. Yes \n#> 2 I regret buying this laptop. It is too slow and the keyboard is too noi… No \n#> 3 Not sure how to feel about my new washing machine. Great color, but har… No\n```\n:::\n\n\n\n\n## python \n\n\n\n::: {.cell}\n\n```{.python .cell-code}\n\nmy_prompt = \"Answer a question.\" \\\n + \"Return only the answer, no explanation\" \\\n + \"Acceptable answers are 'yes', 'no'\" \\\n + \"Answer this about the following text, is this a happy customer?:\"\n\n\nreviews.llm.custom(\"review\", prompt = my_prompt)\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (3, 2)
reviewcustom
strstr
"This has been the best TV I've…"Yes"
"I regret buying this laptop. I…"No"
"Not sure how to feel about my …"No"
\n```\n\n:::\n:::\n\n\n\n:::\n\n## Initialize session\n\nInvoking an `llm` function will automatically initialize a model selection\nif you don't have one selected yet. If there is only one option, it will \npre-select it for you. If there are more than one available models, then `mall`\nwill present you as menu selection so you can select which model you wish to \nuse.\n\nCalling `llm_use()` directly will let you specify the model and backend to use.\nYou can also setup additional arguments that will be passed down to the \nfunction that actually runs the prediction. In the case of Ollama, that function\nis [`chat()`](https://hauselin.github.io/ollama-r/reference/chat.html). \n\n\n\n::: {.panel-tabset group=\"language\"}\n## R\n\nThe model to use, and other options can be set for the current R session\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_use(\"ollama\", \"llama3.2\", seed = 100, temperature = 0)\n```\n:::\n\n\n\n\n## python \n\nThe model to use, and other options can be set for the specific Polars\ndata frame\n\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.use(options = dict(seed = 100))\n```\n:::\n\n\n\n:::\n\n## Key considerations\n\nThe main consideration is **cost**. Either, time cost, or money cost.\n\nIf using this method with an LLM locally available, the cost will be a long \nrunning time. Unless using a very specialized LLM, a given LLM is a general model. \nIt was fitted using a vast amount of data. So determining a response for each \nrow, takes longer than if using a manually created NLP model. The default model\nused in Ollama is [Llama 3.2](https://ollama.com/library/llama3.2), \nwhich was fitted using 3B parameters. \n\nIf using an external LLM service, the consideration will need to be for the \nbilling costs of using such service. Keep in mind that you will be sending a lot\nof data to be evaluated. \n\nAnother consideration is the novelty of this approach. Early tests are \nproviding encouraging results. 
But you, as a user, will still need to keep\nin mind that the predictions will not be infallible, so always check the output.\nAt this time, I think the best use for this method is for a quick analysis.\n\n## Performance\n\nWe will briefly cover this method's performance from two perspectives: \n\n- How long the analysis takes to run locally \n\n- How well it predicts \n\nTo do so, we will use the `data_bookReviews` data set, provided by the `classmap`\npackage. For this exercise, only the first 100, of the total 1,000, are going\nto be part of this analysis.\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(classmap)\n\ndata(data_bookReviews)\n\ndata_bookReviews |>\n glimpse()\n#> Rows: 1,000\n#> Columns: 2\n#> $ review \"i got this as both a book and an audio file. i had waited t…\n#> $ sentiment 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, …\n```\n:::\n\n\nAs per the docs, `sentiment` is a factor indicating the sentiment of the review:\nnegative (1) or positive (2)\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlength(strsplit(paste(head(data_bookReviews$review, 100), collapse = \" \"), \" \")[[1]])\n#> [1] 20470\n```\n:::\n\n\n\nJust to get an idea of how much data we're processing, I'm using a very, very \nsimple word count. So we're analyzing a bit over 20 thousand words.\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews_llm <- data_bookReviews |>\n head(100) |> \n llm_sentiment(\n col = review,\n options = c(\"positive\" ~ 2, \"negative\" ~ 1),\n pred_name = \"predicted\"\n )\n#> ! There were 2 predictions with invalid output, they were coerced to NA\n```\n:::\n\n\n\nAs far as **time**, on my Apple M3 machine, it took about 1.5 minutes to process,\n100 rows, containing 20 thousand words. Setting `temperature` to 0 in `llm_use()`, \nmade the model run faster.\n\nThe package uses `purrr` to send each prompt individually to the LLM. 
But, I did\ntry a few different ways to speed up the process, unsuccessfully:\n\n- Used `furrr` to send multiple requests at a time. This did not work because \neither the LLM or Ollama processed all my requests serially. So there was\nno improvement.\n\n- I also tried sending more than one row's text at a time. This caused instability\nin the number of results. For example, sending 5 at a time sometimes returned 7\nor 8. Even sending 2 was not stable. \n\nThis is what the new table looks like:\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews_llm\n#> # A tibble: 100 × 3\n#> review sentiment predicted\n#> \n#> 1 \"i got this as both a book and an audio file… 1 1\n#> 2 \"this book places too much emphasis on spend… 1 1\n#> 3 \"remember the hollywood blacklist? the holly… 2 2\n#> 4 \"while i appreciate what tipler was attempti… 1 1\n#> 5 \"the others in the series were great, and i … 1 1\n#> 6 \"a few good things, but she's lost her edge … 1 1\n#> 7 \"words cannot describe how ripped off and di… 1 1\n#> 8 \"1. the persective of most writers is shaped… 1 NA\n#> 9 \"i have been a huge fan of michael crichton … 1 1\n#> 10 \"i saw dr. polk on c-span a month or two ago… 2 2\n#> # ℹ 90 more rows\n```\n:::\n\n\n\nI used `yardstick` to see how well the model performed. Of course, the accuracy\nwill not be of the \"truth\", but rather the package's results recorded in \n`sentiment`.\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(forcats)\n\nreviews_llm |>\n mutate(predicted = as.factor(predicted)) |>\n yardstick::accuracy(sentiment, predicted)\n#> # A tibble: 1 × 3\n#> .metric .estimator .estimate\n#> \n#> 1 accuracy binary 0.980\n```\n:::\n\n\n\n## Vector functions (R only)\n\n`mall` includes functions that expect a vector, instead of a table, to run the\npredictions. This should make it easier to test things, such as custom prompts\nor results of specific text. 
Each `llm_` function has a corresponding `llm_vec_`\nfunction:\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_vec_sentiment(\"I am happy\")\n#> [1] \"positive\"\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_vec_translate(\"Este es el mejor dia!\", \"english\")\n#> [1] \"It's the best day!\"\n```\n:::\n", + "supporting": [], + "filters": [ + "rmarkdown/pagebreak.lua" + ], + "includes": {}, + "engineDependencies": {}, + "preserve": {}, + "postProcess": true + } +} \ No newline at end of file diff --git a/index.qmd b/index.qmd index 3555e93..4ceeb97 100644 --- a/index.qmd +++ b/index.qmd @@ -3,8 +3,8 @@ format: html: toc: true execute: - eval: false - freeze: false + eval: true + freeze: true --- @@ -177,7 +177,7 @@ reviews #| include: false -reviews.llm.use(options = dict(seed = 100)) +reviews.llm.use(options = dict(seed = 100), _cache = "_readme_cache") ``` For the first example, we'll asses the sentiment of each review. In order to @@ -274,7 +274,7 @@ reviews.llm.summarize("review", 5) ::: To control the name of the prediction field, you can change `pred_name` argument. -This works with the other `llm_` functions as well. +This works with the other `llm` functions as well. ::: {.panel-tabset group="language"} ## R @@ -326,11 +326,25 @@ that the LLM return the product being referred to. We do this by simply saying in the text. +::: {.panel-tabset group="language"} +## R + ```{r} reviews |> llm_extract(review, "product") ``` + +## python + +```{python} + +reviews.llm.extract("review", "product") +``` + +::: + + ### Translate As the title implies, this function will translate the text into a specified @@ -364,6 +378,10 @@ reviews.llm.translate("review", "spanish") It is possible to pass your own prompt to the LLM, and have `mall` run it against each text entry. 
Use `llm_custom()` to access this functionality: + +::: {.panel-tabset group="language"} +## R + ```{r} my_prompt <- paste( "Answer a question.", @@ -376,9 +394,25 @@ reviews |> llm_custom(review, my_prompt) ``` + +## python + +```{python} + +my_prompt = "Answer a question." \ + + "Return only the answer, no explanation" \ + + "Acceptable answers are 'yes', 'no'" \ + + "Answer this about the following text, is this a happy customer?:" + + +reviews.llm.custom("review", prompt = my_prompt) +``` + +::: + ## Initialize session -Invoking an `llm_` function will automatically initialize a model selection +Invoking an `llm` function will automatically initialize a model selection if you don't have one selected yet. If there is only one option, it will pre-select it for you. If there are more than one available models, then `mall` will present you as menu selection so you can select which model you wish to From 9d6b5b13630a7b95d1294eb201e9fa4ab54c42d8 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Mon, 7 Oct 2024 14:47:18 -0500 Subject: [PATCH 54/57] First pass at updating wording --- _freeze/index/execute-results/html.json | 4 +-- index.qmd | 44 +++++++++++++------------ 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/_freeze/index/execute-results/html.json b/_freeze/index/execute-results/html.json index bf47ab4..18fd744 100644 --- a/_freeze/index/execute-results/html.json +++ b/_freeze/index/execute-results/html.json @@ -1,8 +1,8 @@ { - "hash": "21609c68109605e29ecc8b10d4d012b3", + "hash": "31f1711bc2fee6106b9122901fbc16c3", "result": { "engine": "knitr", - "markdown": "---\nformat:\n html:\n toc: true\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n\n\n\nRun multiple LLM predictions against a data frame. The predictions are processed \nrow-wise over a specified column. It works using a pre-determined one-shot prompt,\nalong with the current row's content. The prompt that is use will depend of the\ntype of analysis needed. 
\n\nCurrently, the included prompts perform the following: \n\n- [Sentiment analysis](#sentiment)\n- [Text summarizing](#summarize)\n- [Classify text](#classify)\n- [Extract one, or several](#extract), specific pieces information from the text\n- [Translate text](#translate)\n- [Custom prompt](#custom-prompt)\n\nThis package is inspired by the SQL AI functions now offered by vendors such as\n[Databricks](https://docs.databricks.com/en/large-language-models/ai-functions.html) \nand Snowflake. `mall` uses [Ollama](https://ollama.com/) to interact with LLMs \ninstalled locally. \n\n\nFor R, that interaction takes place via the \n[`ollamar`](https://hauselin.github.io/ollama-r/) package. The functions are \ndesigned to easily work with piped commands, such as `dplyr`. \n\n```r\nreviews |>\n llm_sentiment(review)\n```\n\n\nFor Python, `mall` includes an extension to [Polars](https://pola.rs/). To\ninteract with Ollama, it uses the official [Python library](https://github.com/ollama/ollama-python).\n\n```python\nreviews.llm.sentiment(\"review\")\n```\n\n## Motivation\n\nWe want to new find ways to help data scientists use LLMs in their daily work. \nUnlike the familiar interfaces, such as chatting and code completion, this interface\nruns your text data directly against the LLM. \n\nThe LLM's flexibility, allows for it to adapt to the subject of your data, and \nprovide surprisingly accurate predictions. This saves the data scientist the\nneed to write and tune an NLP model. \n\nIn recent times, the capabilities of LLMs that can run locally in your computer \nhave increased dramatically. This means that these sort of analysis can run\nin your machine with good accuracy. Additionally, it makes it possible to take\nadvantage of LLM's at your institution, since the data will not leave the\ncorporate network. 
\n\n## Get started\n\n- Install `mall` from Github\n\n \n::: {.panel-tabset group=\"language\"}\n## R\n```r\npak::pak(\"edgararuiz/mall/r@python\")\n```\n\n## python\n```python\npip install \"mall @ git+https://git@github.com/edgararuiz/mall.git@python#subdirectory=python\"\n```\n:::\n \n\n### With local LLMs\n\n- [Download Ollama from the official website](https://ollama.com/download)\n\n- Install and start Ollama in your computer\n\n\n::: {.panel-tabset group=\"language\"}\n## R\n- Install Ollama in your machine. The `ollamar` package's website provides this\n[Installation guide](https://hauselin.github.io/ollama-r/#installation)\n\n- Download an LLM model. For example, I have been developing this package using\nLlama 3.2 to test. To get that model you can run: \n ```r\n ollamar::pull(\"llama3.2\")\n ```\n \n## python\n\n- Install the official Ollama library\n ```python\n pip install ollama\n ```\n\n- Download an LLM model. For example, I have been developing this package using\nLlama 3.2 to test. To get that model you can run: \n ```python\n import ollama\n ollama.pull('llama3.2')\n ```\n:::\n\n\n \n### With Databricks (R only)\n\nIf you pass a table connected to **Databricks** via `odbc`, `mall` will \nautomatically use Databricks' LLM instead of Ollama. *You won't need Ollama \ninstalled if you are using Databricks only.*\n\n`mall` will call the appropriate SQL AI function. For more information see our \n[Databricks article.](https://edgararuiz.github.io/mall/articles/databricks.html) \n\n## LLM functions\n\n### Sentiment\n\nPrimarily, `mall` provides verb-like functions that expect a `tbl` as \ntheir first argument. This allows us to use them in piped operations. \n\nWe will start with loading a very small data set contained in `mall`. 
It has\n3 product reviews that we will use as the source of our examples.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(mall)\n\ndata(\"reviews\")\n\nreviews\n#> # A tibble: 3 × 1\n#> review \n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. \n#> 2 I regret buying this laptop. It is too slow and the keyboard is too noisy \n#> 3 Not sure how to feel about my new washing machine. Great color, but hard to f…\n```\n:::\n\n\n\n## python\n\n\n\n::: {.cell}\n\n```{.python .cell-code}\nimport mall \nimport polars as pl\n\ndata = mall.MallData\n\nreviews = data.reviews\nreviews \n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (3, 1)
review
str
"This has been the best TV I've…
"I regret buying this laptop. I…
"Not sure how to feel about my …
\n```\n\n:::\n:::\n\n\n:::\n\n\n\n\n\n\n\nFor the first example, we'll assess the sentiment of each review. In order to \ndo this we will call `llm_sentiment()`:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n\nreviews |>\n llm_sentiment(review)\n#> # A tibble: 3 × 2\n#> review .sentiment\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. positive \n#> 2 I regret buying this laptop. It is too slow and the keyboard is to… negative \n#> 3 Not sure how to feel about my new washing machine. Great color, bu… neutral\n```\n:::\n\n\n\n## python \n\n\n\n::: {.cell}\n\n```{.python .cell-code}\n\nreviews.llm.sentiment(\"review\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (3, 2)
reviewsentiment
strstr
"This has been the best TV I've…"positive"
"I regret buying this laptop. I…"negative"
"Not sure how to feel about my …"neutral"
\n```\n\n:::\n:::\n\n\n\n:::\n\nThe function lets us modify the options to choose from: \n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n\nreviews |>\n llm_sentiment(review, options = c(\"positive\", \"negative\"))\n#> # A tibble: 3 × 2\n#> review .sentiment\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. positive \n#> 2 I regret buying this laptop. It is too slow and the keyboard is to… negative \n#> 3 Not sure how to feel about my new washing machine. Great color, bu… negative\n```\n:::\n\n\n\n## python \n\n\n\n::: {.cell}\n\n```{.python .cell-code}\n\nreviews.llm.sentiment(\"review\", options=[\"positive\", \"negative\"])\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (3, 2)
reviewsentiment
strstr
"This has been the best TV I've…"positive"
"I regret buying this laptop. I…"negative"
"Not sure how to feel about my …"negative"
\n```\n\n:::\n:::\n\n\n\n:::\n\nAs mentioned before, by being pipe friendly, the results from the LLM prediction\ncan be used in further transformations: \n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n\nreviews |>\n llm_sentiment(review, options = c(\"positive\", \"negative\")) |>\n filter(.sentiment == \"negative\")\n#> # A tibble: 2 × 2\n#> review .sentiment\n#> \n#> 1 I regret buying this laptop. It is too slow and the keyboard is to… negative \n#> 2 Not sure how to feel about my new washing machine. Great color, bu… negative\n```\n:::\n\n\n\n## python\n\n\n\n::: {.cell}\n\n```{.python .cell-code}\n\nx = reviews.llm.sentiment(\"review\", options=[\"positive\", \"negative\"])\n\nx.filter(pl.col(\"sentiment\") == \"negative\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (2, 2)
reviewsentiment
strstr
"I regret buying this laptop. I…"negative"
"Not sure how to feel about my …"negative"
\n```\n\n:::\n:::\n\n\n\n:::\n\n### Summarize\n\nThere may be a need to reduce the number of words in a given text. Usually, to \nmake it easier to capture its intent. To do this, use `llm_summarize()`. This\nfunction has an argument to control the maximum number of words to output \n(`max_words`):\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n\n\nreviews |>\n llm_summarize(review, max_words = 5)\n#> # A tibble: 3 × 2\n#> review .summary \n#> \n#> 1 This has been the best TV I've ever used. Gr… it's a great tv \n#> 2 I regret buying this laptop. It is too slow … laptop purchase was a mistake \n#> 3 Not sure how to feel about my new washing ma… having mixed feelings about it\n```\n:::\n\n\n\n## python \n\n\n\n::: {.cell}\n\n```{.python .cell-code}\n\n\nreviews.llm.summarize(\"review\", 5)\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (3, 2)
reviewsummary
strstr
"This has been the best TV I've…"it's a great tv"
"I regret buying this laptop. I…"laptop not worth the money"
"Not sure how to feel about my …"feeling uncertain about new pu…
\n```\n\n:::\n:::\n\n\n\n:::\n\nTo control the name of the prediction field, you can change `pred_name` argument.\nThis works with the other `llm` functions as well. \n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n\nreviews |>\n llm_summarize(review, max_words = 5, pred_name = \"review_summary\")\n#> # A tibble: 3 × 2\n#> review review_summary \n#> \n#> 1 This has been the best TV I've ever used. Gr… it's a great tv \n#> 2 I regret buying this laptop. It is too slow … laptop purchase was a mistake \n#> 3 Not sure how to feel about my new washing ma… having mixed feelings about it\n```\n:::\n\n\n\n## python \n\n\n\n::: {.cell}\n\n```{.python .cell-code}\n\nreviews.llm.summarize(\"review\", max_words = 5, pred_name = \"review_summary\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (3, 2)
reviewreview_summary
strstr
"This has been the best TV I've…"it's a great tv"
"I regret buying this laptop. I…"laptop not worth the money"
"Not sure how to feel about my …"feeling uncertain about new pu…
\n```\n\n:::\n:::\n\n\n\n:::\n\n### Classify\n\nUse the LLM to categorize the text into one of the options you provide: \n\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_classify(review, c(\"appliance\", \"computer\"))\n#> # A tibble: 3 × 2\n#> review .classify\n#> \n#> 1 This has been the best TV I've ever used. Gr… computer \n#> 2 I regret buying this laptop. It is too slow … computer \n#> 3 Not sure how to feel about my new washing ma… appliance\n```\n:::\n\n\n\n\n## python \n\n\n\n::: {.cell}\n\n```{.python .cell-code}\n\nreviews.llm.classify(\"review\", [\"computer\", \"appliance\"])\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (3, 2)
reviewclassify
strstr
"This has been the best TV I've…"appliance"
"I regret buying this laptop. I…"appliance"
"Not sure how to feel about my …"appliance"
\n```\n\n:::\n:::\n\n\n\n:::\n\n### Extract \n\nOne of the most interesting operations. Using natural language, we can tell the \nLLM to return a specific part of the text. In the following example, we request\nthat the LLM return the product being referred to. We do this by simply saying \n\"product\". The LLM understands what we *mean* by that word, and looks for that\nin the text.\n\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_extract(review, \"product\")\n#> # A tibble: 3 × 2\n#> review .extract \n#> \n#> 1 This has been the best TV I've ever used. Gr… tv \n#> 2 I regret buying this laptop. It is too slow … laptop \n#> 3 Not sure how to feel about my new washing ma… washing machine\n```\n:::\n\n\n\n\n## python \n\n\n\n::: {.cell}\n\n```{.python .cell-code}\n\nreviews.llm.extract(\"review\", \"product\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (3, 2)
reviewextract
strstr
"This has been the best TV I've…"tv"
"I regret buying this laptop. I…"laptop"
"Not sure how to feel about my …"washing machine"
\n```\n\n:::\n:::\n\n\n\n:::\n\n\n### Translate\n\nAs the title implies, this function will translate the text into a specified \nlanguage. What is really nice is that you don't need to specify the language\nof the source text. Only the target language needs to be defined. The translation\naccuracy will depend on the LLM.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n\n\nreviews |>\n llm_translate(review, \"spanish\")\n#> # A tibble: 3 × 2\n#> review .translation \n#> \n#> 1 This has been the best TV I've ever used. Gr… Esta ha sido la mejor televisió…\n#> 2 I regret buying this laptop. It is too slow … Me arrepiento de comprar este p…\n#> 3 Not sure how to feel about my new washing ma… No estoy seguro de cómo me sien…\n```\n:::\n\n\n\n\n## python \n\n\n\n::: {.cell}\n\n```{.python .cell-code}\n\n\nreviews.llm.translate(\"review\", \"spanish\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (3, 2)
reviewtranslation
strstr
"This has been the best TV I've…"Esta ha sido la mejor TV que h…
"I regret buying this laptop. I…"Lo lamento comprar este portát…
"Not sure how to feel about my …"No estoy seguro de cómo sentir…
\n```\n\n:::\n:::\n\n\n\n:::\n\n### Custom prompt\n\nIt is possible to pass your own prompt to the LLM, and have `mall` run it \nagainst each text entry. Use `llm_custom()` to access this functionality: \n\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nmy_prompt <- paste(\n \"Answer a question.\",\n \"Return only the answer, no explanation\",\n \"Acceptable answers are 'yes', 'no'\",\n \"Answer this about the following text, is this a happy customer?:\"\n)\n\nreviews |>\n llm_custom(review, my_prompt)\n#> # A tibble: 3 × 2\n#> review .pred\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. Yes \n#> 2 I regret buying this laptop. It is too slow and the keyboard is too noi… No \n#> 3 Not sure how to feel about my new washing machine. Great color, but har… No\n```\n:::\n\n\n\n\n## python \n\n\n\n::: {.cell}\n\n```{.python .cell-code}\n\nmy_prompt = \"Answer a question.\" \\\n + \"Return only the answer, no explanation\" \\\n + \"Acceptable answers are 'yes', 'no'\" \\\n + \"Answer this about the following text, is this a happy customer?:\"\n\n\nreviews.llm.custom(\"review\", prompt = my_prompt)\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (3, 2)
reviewcustom
strstr
"This has been the best TV I've…"Yes"
"I regret buying this laptop. I…"No"
"Not sure how to feel about my …"No"
\n```\n\n:::\n:::\n\n\n\n:::\n\n## Initialize session\n\nInvoking an `llm` function will automatically initialize a model selection\nif you don't have one selected yet. If there is only one option, it will \npre-select it for you. If there are more than one available models, then `mall`\nwill present you as menu selection so you can select which model you wish to \nuse.\n\nCalling `llm_use()` directly will let you specify the model and backend to use.\nYou can also setup additional arguments that will be passed down to the \nfunction that actually runs the prediction. In the case of Ollama, that function\nis [`chat()`](https://hauselin.github.io/ollama-r/reference/chat.html). \n\n\n\n::: {.panel-tabset group=\"language\"}\n## R\n\nThe model to use, and other options can be set for the current R session\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_use(\"ollama\", \"llama3.2\", seed = 100, temperature = 0)\n```\n:::\n\n\n\n\n## python \n\nThe model to use, and other options can be set for the specific Polars\ndata frame\n\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.use(options = dict(seed = 100))\n```\n:::\n\n\n\n:::\n\n## Key considerations\n\nThe main consideration is **cost**. Either, time cost, or money cost.\n\nIf using this method with an LLM locally available, the cost will be a long \nrunning time. Unless using a very specialized LLM, a given LLM is a general model. \nIt was fitted using a vast amount of data. So determining a response for each \nrow, takes longer than if using a manually created NLP model. The default model\nused in Ollama is [Llama 3.2](https://ollama.com/library/llama3.2), \nwhich was fitted using 3B parameters. \n\nIf using an external LLM service, the consideration will need to be for the \nbilling costs of using such service. Keep in mind that you will be sending a lot\nof data to be evaluated. \n\nAnother consideration is the novelty of this approach. Early tests are \nproviding encouraging results. 
But you, as a user, will still need to keep\nin mind that the predictions will not be infallible, so always check the output.\nAt this time, I think the best use for this method is for a quick analysis.\n\n## Performance\n\nWe will briefly cover this method's performance from two perspectives: \n\n- How long the analysis takes to run locally \n\n- How well it predicts \n\nTo do so, we will use the `data_bookReviews` data set, provided by the `classmap`\npackage. For this exercise, only the first 100, of the total 1,000, are going\nto be part of this analysis.\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(classmap)\n\ndata(data_bookReviews)\n\ndata_bookReviews |>\n glimpse()\n#> Rows: 1,000\n#> Columns: 2\n#> $ review \"i got this as both a book and an audio file. i had waited t…\n#> $ sentiment 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, …\n```\n:::\n\n\nAs per the docs, `sentiment` is a factor indicating the sentiment of the review:\nnegative (1) or positive (2)\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlength(strsplit(paste(head(data_bookReviews$review, 100), collapse = \" \"), \" \")[[1]])\n#> [1] 20470\n```\n:::\n\n\n\nJust to get an idea of how much data we're processing, I'm using a very, very \nsimple word count. So we're analyzing a bit over 20 thousand words.\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews_llm <- data_bookReviews |>\n head(100) |> \n llm_sentiment(\n col = review,\n options = c(\"positive\" ~ 2, \"negative\" ~ 1),\n pred_name = \"predicted\"\n )\n#> ! There were 2 predictions with invalid output, they were coerced to NA\n```\n:::\n\n\n\nAs far as **time**, on my Apple M3 machine, it took about 1.5 minutes to process,\n100 rows, containing 20 thousand words. Setting `temperature` to 0 in `llm_use()`, \nmade the model run faster.\n\nThe package uses `purrr` to send each prompt individually to the LLM. 
But, I did\ntry a few different ways to speed up the process, unsuccessfully:\n\n- Used `furrr` to send multiple requests at a time. This did not work because \neither the LLM or Ollama processed all my requests serially. So there was\nno improvement.\n\n- I also tried sending more than one row's text at a time. This cause instability\nin the number of results. For example sending 5 at a time, sometimes returned 7\nor 8. Even sending 2 was not stable. \n\nThis is what the new table looks like:\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews_llm\n#> # A tibble: 100 × 3\n#> review sentiment predicted\n#> \n#> 1 \"i got this as both a book and an audio file… 1 1\n#> 2 \"this book places too much emphasis on spend… 1 1\n#> 3 \"remember the hollywood blacklist? the holly… 2 2\n#> 4 \"while i appreciate what tipler was attempti… 1 1\n#> 5 \"the others in the series were great, and i … 1 1\n#> 6 \"a few good things, but she's lost her edge … 1 1\n#> 7 \"words cannot describe how ripped off and di… 1 1\n#> 8 \"1. the persective of most writers is shaped… 1 NA\n#> 9 \"i have been a huge fan of michael crichton … 1 1\n#> 10 \"i saw dr. polk on c-span a month or two ago… 2 2\n#> # ℹ 90 more rows\n```\n:::\n\n\n\nI used `yardstick` to see how well the model performed. Of course, the accuracy\nwill not be of the \"truth\", but rather the package's results recorded in \n`sentiment`.\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(forcats)\n\nreviews_llm |>\n mutate(predicted = as.factor(predicted)) |>\n yardstick::accuracy(sentiment, predicted)\n#> # A tibble: 1 × 3\n#> .metric .estimator .estimate\n#> \n#> 1 accuracy binary 0.980\n```\n:::\n\n\n\n## Vector functions (R only)\n\n`mall` includes functions that expect a vector, instead of a table, to run the\npredictions. This should make it easier to test things, such as custom prompts\nor results of specific text. 
Each `llm_` function has a corresponding `llm_vec_`\nfunction:\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_vec_sentiment(\"I am happy\")\n#> [1] \"positive\"\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_vec_translate(\"Este es el mejor dia!\", \"english\")\n#> [1] \"It's the best day!\"\n```\n:::\n", + "markdown": "---\nformat:\n html:\n toc: true\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n\n\n\nRun multiple LLM predictions against a data frame. The predictions are processed \nrow-wise over a specified column. It works using a pre-determined one-shot prompt,\nalong with the current row's content. The prompt that is use will depend of the\ntype of analysis needed. \n\nCurrently, the included prompts perform the following: \n\n- [Sentiment analysis](#sentiment)\n- [Text summarizing](#summarize)\n- [Classify text](#classify)\n- [Extract one, or several](#extract), specific pieces information from the text\n- [Translate text](#translate)\n- [Custom prompt](#custom-prompt)\n\nThis package is inspired by the SQL AI functions now offered by vendors such as\n[Databricks](https://docs.databricks.com/en/large-language-models/ai-functions.html) \nand Snowflake. `mall` uses [Ollama](https://ollama.com/) to interact with LLMs \ninstalled locally. \n\n\nFor R, that interaction takes place via the \n[`ollamar`](https://hauselin.github.io/ollama-r/) package. The functions are \ndesigned to easily work with piped commands, such as `dplyr`. \n\n```r\nreviews |>\n llm_sentiment(review)\n```\n\n\nFor Python, `mall` includes an extension to [Polars](https://pola.rs/). To\ninteract with Ollama, it uses the official [Python library](https://github.com/ollama/ollama-python).\n\n```python\nreviews.llm.sentiment(\"review\")\n```\n\n## Motivation\n\nWe want to new find ways to help data scientists use LLMs in their daily work. \nUnlike the familiar interfaces, such as chatting and code completion, this interface\nruns your text data directly against the LLM. 
\n\nThe LLM's flexibility, allows for it to adapt to the subject of your data, and \nprovide surprisingly accurate predictions. This saves the data scientist the\nneed to write and tune an NLP model. \n\nIn recent times, the capabilities of LLMs that can run locally in your computer \nhave increased dramatically. This means that these sort of analysis can run\nin your machine with good accuracy. Additionally, it makes it possible to take\nadvantage of LLM's at your institution, since the data will not leave the\ncorporate network. \n\n## Get started\n\n- Install `mall` from Github\n\n \n::: {.panel-tabset group=\"language\"}\n## R\n```r\npak::pak(\"edgararuiz/mall/r@python\")\n```\n\n## python\n```python\npip install \"mall @ git+https://git@github.com/edgararuiz/mall.git@python#subdirectory=python\"\n```\n:::\n \n\n### With local LLMs\n\n- [Download Ollama from the official website](https://ollama.com/download)\n\n- Install and start Ollama in your computer\n\n\n::: {.panel-tabset group=\"language\"}\n## R\n- Install Ollama in your machine. The `ollamar` package's website provides this\n[Installation guide](https://hauselin.github.io/ollama-r/#installation)\n\n- Download an LLM model. For example, I have been developing this package using\nLlama 3.2 to test. To get that model you can run: \n ```r\n ollamar::pull(\"llama3.2\")\n ```\n \n## python\n\n- Install the official Ollama library\n ```python\n pip install ollama\n ```\n\n- Download an LLM model. For example, I have been developing this package using\nLlama 3.2 to test. To get that model you can run: \n ```python\n import ollama\n ollama.pull('llama3.2')\n ```\n:::\n\n\n \n### With Databricks (R only)\n\nIf you pass a table connected to **Databricks** via `odbc`, `mall` will \nautomatically use Databricks' LLM instead of Ollama. *You won't need Ollama \ninstalled if you are using Databricks only.*\n\n`mall` will call the appropriate SQL AI function. 
For more information see our \n[Databricks article.](https://edgararuiz.github.io/mall/articles/databricks.html) \n\n## LLM functions\n\n### Sentiment\n\nPrimarily, `mall` provides verb-like functions that expect a data frame as \ntheir first argument. \n\nWe will start with loading a very small data set contained in `mall`. It has\n3 product reviews that we will use as the source of our examples.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(mall)\n\ndata(\"reviews\")\n\nreviews\n#> # A tibble: 3 × 1\n#> review \n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. \n#> 2 I regret buying this laptop. It is too slow and the keyboard is too noisy \n#> 3 Not sure how to feel about my new washing machine. Great color, but hard to f…\n```\n:::\n\n\n\n## python\n\n\n\n::: {.cell}\n\n```{.python .cell-code}\nimport mall \nimport polars as pl\n\ndata = mall.MallData\n\nreviews = data.reviews\nreviews \n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (3, 1)
review
str
"This has been the best TV I've…
"I regret buying this laptop. I…
"Not sure how to feel about my …
\n```\n\n:::\n:::\n\n\n:::\n\n\n\n\n\n\n\nFor the first example, we'll asses the sentiment of each review:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n\nreviews |>\n llm_sentiment(review)\n#> # A tibble: 3 × 2\n#> review .sentiment\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. positive \n#> 2 I regret buying this laptop. It is too slow and the keyboard is to… negative \n#> 3 Not sure how to feel about my new washing machine. Great color, bu… neutral\n```\n:::\n\n\n\n## python \n\n\n\n::: {.cell}\n\n```{.python .cell-code}\n\nreviews.llm.sentiment(\"review\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (3, 2)
reviewsentiment
strstr
"This has been the best TV I've…"positive"
"I regret buying this laptop. I…"negative"
"Not sure how to feel about my …"neutral"
\n```\n\n:::\n:::\n\n\n\n:::\n\nWe can also provide custom sentiment labels. Use the `options` argument to control\nthat:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n\nreviews |>\n llm_sentiment(review, options = c(\"positive\", \"negative\"))\n#> # A tibble: 3 × 2\n#> review .sentiment\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. positive \n#> 2 I regret buying this laptop. It is too slow and the keyboard is to… negative \n#> 3 Not sure how to feel about my new washing machine. Great color, bu… negative\n```\n:::\n\n\n\n## python \n\n\n\n::: {.cell}\n\n```{.python .cell-code}\n\nreviews.llm.sentiment(\"review\", options=[\"positive\", \"negative\"])\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (3, 2)
reviewsentiment
strstr
"This has been the best TV I've…"positive"
"I regret buying this laptop. I…"negative"
"Not sure how to feel about my …"negative"
\n```\n\n:::\n:::\n\n\n\n:::\n\nAs mentioned before, these functions are create to play well with the rest of \nthe analysis\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n\nreviews |>\n llm_sentiment(review, options = c(\"positive\", \"negative\")) |>\n filter(.sentiment == \"negative\")\n#> # A tibble: 2 × 2\n#> review .sentiment\n#> \n#> 1 I regret buying this laptop. It is too slow and the keyboard is to… negative \n#> 2 Not sure how to feel about my new washing machine. Great color, bu… negative\n```\n:::\n\n\n\n## python\n\n\n\n::: {.cell}\n\n```{.python .cell-code}\n\nx = reviews.llm.sentiment(\"review\", options=[\"positive\", \"negative\"])\n\nx.filter(pl.col(\"sentiment\") == \"negative\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (2, 2)
reviewsentiment
strstr
"I regret buying this laptop. I…"negative"
"Not sure how to feel about my …"negative"
\n```\n\n:::\n:::\n\n\n\n:::\n\n### Summarize\n\nThere may be a need to reduce the number of words in a given text. Typically to \nmake it easier to understand its intent. The function has an argument to \ncontrol the maximum number of words to output \n(`max_words`):\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n\n\nreviews |>\n llm_summarize(review, max_words = 5)\n#> # A tibble: 3 × 2\n#> review .summary \n#> \n#> 1 This has been the best TV I've ever used. Gr… it's a great tv \n#> 2 I regret buying this laptop. It is too slow … laptop purchase was a mistake \n#> 3 Not sure how to feel about my new washing ma… having mixed feelings about it\n```\n:::\n\n\n\n## python \n\n\n\n::: {.cell}\n\n```{.python .cell-code}\n\n\nreviews.llm.summarize(\"review\", 5)\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (3, 2)
reviewsummary
strstr
"This has been the best TV I've…"it's a great tv"
"I regret buying this laptop. I…"laptop not worth the money"
"Not sure how to feel about my …"feeling uncertain about new pu…
\n```\n\n:::\n:::\n\n\n\n:::\n\nTo control the name of the prediction field, you can change `pred_name` argument.\n**This works with the other `llm` functions as well.**\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n\nreviews |>\n llm_summarize(review, max_words = 5, pred_name = \"review_summary\")\n#> # A tibble: 3 × 2\n#> review review_summary \n#> \n#> 1 This has been the best TV I've ever used. Gr… it's a great tv \n#> 2 I regret buying this laptop. It is too slow … laptop purchase was a mistake \n#> 3 Not sure how to feel about my new washing ma… having mixed feelings about it\n```\n:::\n\n\n\n## python \n\n\n\n::: {.cell}\n\n```{.python .cell-code}\n\nreviews.llm.summarize(\"review\", max_words = 5, pred_name = \"review_summary\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (3, 2)
reviewreview_summary
strstr
"This has been the best TV I've…"it's a great tv"
"I regret buying this laptop. I…"laptop not worth the money"
"Not sure how to feel about my …"feeling uncertain about new pu…
\n```\n\n:::\n:::\n\n\n\n:::\n\n### Classify\n\nUse the LLM to categorize the text into one of the options you provide: \n\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_classify(review, c(\"appliance\", \"computer\"))\n#> # A tibble: 3 × 2\n#> review .classify\n#> \n#> 1 This has been the best TV I've ever used. Gr… computer \n#> 2 I regret buying this laptop. It is too slow … computer \n#> 3 Not sure how to feel about my new washing ma… appliance\n```\n:::\n\n\n\n\n## python \n\n\n\n::: {.cell}\n\n```{.python .cell-code}\n\nreviews.llm.classify(\"review\", [\"computer\", \"appliance\"])\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (3, 2)
reviewclassify
strstr
"This has been the best TV I've…"appliance"
"I regret buying this laptop. I…"appliance"
"Not sure how to feel about my …"appliance"
\n```\n\n:::\n:::\n\n\n\n:::\n\n### Extract \n\nOne of the most interesting use cases Using natural language, we can tell the \nLLM to return a specific part of the text. In the following example, we request\nthat the LLM return the product being referred to. We do this by simply saying \n\"product\". The LLM understands what we *mean* by that word, and looks for that\nin the text.\n\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_extract(review, \"product\")\n#> # A tibble: 3 × 2\n#> review .extract \n#> \n#> 1 This has been the best TV I've ever used. Gr… tv \n#> 2 I regret buying this laptop. It is too slow … laptop \n#> 3 Not sure how to feel about my new washing ma… washing machine\n```\n:::\n\n\n\n\n## python \n\n\n\n::: {.cell}\n\n```{.python .cell-code}\n\nreviews.llm.extract(\"review\", \"product\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (3, 2)
reviewextract
strstr
"This has been the best TV I've…"tv"
"I regret buying this laptop. I…"laptop"
"Not sure how to feel about my …"washing machine"
\n```\n\n:::\n:::\n\n\n\n:::\n\n\n### Translate\n\nAs the title implies, this function will translate the text into a specified \nlanguage. What is really nice, it is that you don't need to specify the language\nof the source text. Only the target language needs to be defined. The translation\naccuracy will depend on the LLM\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n\n\nreviews |>\n llm_translate(review, \"spanish\")\n#> # A tibble: 3 × 2\n#> review .translation \n#> \n#> 1 This has been the best TV I've ever used. Gr… Esta ha sido la mejor televisió…\n#> 2 I regret buying this laptop. It is too slow … Me arrepiento de comprar este p…\n#> 3 Not sure how to feel about my new washing ma… No estoy seguro de cómo me sien…\n```\n:::\n\n\n\n\n## python \n\n\n\n::: {.cell}\n\n```{.python .cell-code}\n\n\nreviews.llm.translate(\"review\", \"spanish\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (3, 2)
reviewtranslation
strstr
"This has been the best TV I've…"Esta ha sido la mejor TV que h…
"I regret buying this laptop. I…"Lo lamento comprar este portát…
"Not sure how to feel about my …"No estoy seguro de cómo sentir…
\n```\n\n:::\n:::\n\n\n\n:::\n\n### Custom prompt\n\nIt is possible to pass your own prompt to the LLM, and have `mall` run it \nagainst each text entry:\n\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nmy_prompt <- paste(\n \"Answer a question.\",\n \"Return only the answer, no explanation\",\n \"Acceptable answers are 'yes', 'no'\",\n \"Answer this about the following text, is this a happy customer?:\"\n)\n\nreviews |>\n llm_custom(review, my_prompt)\n#> # A tibble: 3 × 2\n#> review .pred\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. Yes \n#> 2 I regret buying this laptop. It is too slow and the keyboard is too noi… No \n#> 3 Not sure how to feel about my new washing machine. Great color, but har… No\n```\n:::\n\n\n\n\n## python \n\n\n\n::: {.cell}\n\n```{.python .cell-code}\n\nmy_prompt = \"Answer a question.\" \\\n + \"Return only the answer, no explanation\" \\\n + \"Acceptable answers are 'yes', 'no'\" \\\n + \"Answer this about the following text, is this a happy customer?:\"\n\n\nreviews.llm.custom(\"review\", prompt = my_prompt)\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\nshape: (3, 2)
reviewcustom
strstr
"This has been the best TV I've…"Yes"
"I regret buying this laptop. I…"No"
"Not sure how to feel about my …"No"
\n```\n\n:::\n:::\n\n\n\n:::\n\n## Model selection and settings\n\nYou can set the model and its options to use when calling the LLM. In this case,\nwe refer to options as model specific things that can be set, such as seed or\ntemperature. \n\n::: {.panel-tabset group=\"language\"}\n## R\n\nInvoking an `llm` function will automatically initialize a model selection\nif you don't have one selected yet. If there is only one option, it will \npre-select it for you. If there are more than one available models, then `mall`\nwill present you as menu selection so you can select which model you wish to \nuse.\n\nCalling `llm_use()` directly will let you specify the model and backend to use.\nYou can also setup additional arguments that will be passed down to the \nfunction that actually runs the prediction. In the case of Ollama, that function\nis [`chat()`](https://hauselin.github.io/ollama-r/reference/chat.html). \n\nThe model to use, and other options can be set for the current R session\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_use(\"ollama\", \"llama3.2\", seed = 100, temperature = 0)\n```\n:::\n\n\n\n\n## python \n\nThe model and options to be used will be defined at the Polars data frame \nobject level. If not passed, the default model will be **llama3.2**.\n\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.use(options = dict(seed = 100))\n```\n:::\n\n\n\n:::\n\n## Key considerations\n\nThe main consideration is **cost**. Either, time cost, or money cost.\n\nIf using this method with an LLM locally available, the cost will be a long \nrunning time. Unless using a very specialized LLM, a given LLM is a general model. \nIt was fitted using a vast amount of data. So determining a response for each \nrow, takes longer than if using a manually created NLP model. The default model\nused in Ollama is [Llama 3.2](https://ollama.com/library/llama3.2), \nwhich was fitted using 3B parameters. 
\n\nIf using an external LLM service, the consideration will need to be for the \nbilling costs of using such service. Keep in mind that you will be sending a lot\nof data to be evaluated. \n\nAnother consideration is the novelty of this approach. Early tests are \nproviding encouraging results. But you, as an user, will still need to keep\nin mind that the predictions will not be infallible, so always check the output.\nAt this time, I think the best use for this method, is for a quick analysis.\n\n## Performance\n\nWe will briefly cover this methods performance from two perspectives: \n\n- How long the analysis takes to run locally \n\n- How well it predicts \n\nTo do so, we will use the `data_bookReviews` data set, provided by the `classmap`\npackage. For this exercise, only the first 100, of the total 1,000, are going\nto be part of this analysis.\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(classmap)\n\ndata(data_bookReviews)\n\ndata_bookReviews |>\n glimpse()\n#> Rows: 1,000\n#> Columns: 2\n#> $ review \"i got this as both a book and an audio file. i had waited t…\n#> $ sentiment 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, …\n```\n:::\n\n\nAs per the docs, `sentiment` is a factor indicating the sentiment of the review:\nnegative (1) or positive (2)\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlength(strsplit(paste(head(data_bookReviews$review, 100), collapse = \" \"), \" \")[[1]])\n#> [1] 20470\n```\n:::\n\n\n\nJust to get an idea of how much data we're processing, I'm using a very, very \nsimple word count. So we're analyzing a bit over 20 thousand words.\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews_llm <- data_bookReviews |>\n head(100) |> \n llm_sentiment(\n col = review,\n options = c(\"positive\" ~ 2, \"negative\" ~ 1),\n pred_name = \"predicted\"\n )\n#> ! 
There were 2 predictions with invalid output, they were coerced to NA\n```\n:::\n\n\n\nAs far as **time**, on my Apple M3 machine, it took about 1.5 minutes to process,\n100 rows, containing 20 thousand words. Setting `temp` to 0 in `llm_use()`, \nmade the model run faster.\n\nThe package uses `purrr` to send each prompt individually to the LLM. But, I did\ntry a few different ways to speed up the process, unsuccessfully:\n\n- Used `furrr` to send multiple requests at a time. This did not work because \neither the LLM or Ollama processed all my requests serially. So there was\nno improvement.\n\n- I also tried sending more than one row's text at a time. This cause instability\nin the number of results. For example sending 5 at a time, sometimes returned 7\nor 8. Even sending 2 was not stable. \n\nThis is what the new table looks like:\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews_llm\n#> # A tibble: 100 × 3\n#> review sentiment predicted\n#> \n#> 1 \"i got this as both a book and an audio file… 1 1\n#> 2 \"this book places too much emphasis on spend… 1 1\n#> 3 \"remember the hollywood blacklist? the holly… 2 2\n#> 4 \"while i appreciate what tipler was attempti… 1 1\n#> 5 \"the others in the series were great, and i … 1 1\n#> 6 \"a few good things, but she's lost her edge … 1 1\n#> 7 \"words cannot describe how ripped off and di… 1 1\n#> 8 \"1. the persective of most writers is shaped… 1 NA\n#> 9 \"i have been a huge fan of michael crichton … 1 1\n#> 10 \"i saw dr. polk on c-span a month or two ago… 2 2\n#> # ℹ 90 more rows\n```\n:::\n\n\n\nI used `yardstick` to see how well the model performed. 
Of course, the accuracy\nwill not be of the \"truth\", but rather the package's results recorded in \n`sentiment`.\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(forcats)\n\nreviews_llm |>\n mutate(predicted = as.factor(predicted)) |>\n yardstick::accuracy(sentiment, predicted)\n#> # A tibble: 1 × 3\n#> .metric .estimator .estimate\n#> \n#> 1 accuracy binary 0.980\n```\n:::\n\n\n\n## Vector functions (R only)\n\n`mall` includes functions that expect a vector, instead of a table, to run the\npredictions. This should make it easier to test things, such as custom prompts\nor results of specific text. Each `llm_` function has a corresponding `llm_vec_`\nfunction:\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_vec_sentiment(\"I am happy\")\n#> [1] \"positive\"\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_vec_translate(\"Este es el mejor dia!\", \"english\")\n#> [1] \"It's the best day!\"\n```\n:::\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/index.qmd b/index.qmd index 4ceeb97..f8cf4e8 100644 --- a/index.qmd +++ b/index.qmd @@ -142,8 +142,8 @@ installed if you are using Databricks only.* ### Sentiment -Primarily, `mall` provides verb-like functions that expect a `tbl` as -their first argument. This allows us to use them in piped operations. +Primarily, `mall` provides verb-like functions that expect a data frame as +their first argument. We will start with loading a very small data set contained in `mall`. It has 3 product reviews that we will use as the source of our examples. @@ -180,8 +180,7 @@ reviews reviews.llm.use(options = dict(seed = 100), _cache = "_readme_cache") ``` -For the first example, we'll asses the sentiment of each review. 
In order to -do this we will call `llm_sentiment()`: +For the first example, we'll assess the sentiment of each review: ::: {.panel-tabset group="language"} ## R @@ -201,7 +200,8 @@ reviews.llm.sentiment("review") ::: -The function let's us modify the options to choose from: +We can also provide custom sentiment labels. Use the `options` argument to control +that: ::: {.panel-tabset group="language"} ## R @@ -221,8 +221,8 @@ reviews.llm.sentiment("review", options=["positive", "negative"]) ::: -As mentioned before, by being pipe friendly, the results from the LLM prediction -can be used in further transformations: +As mentioned before, these functions are created to play well with the rest of +the analysis: ::: {.panel-tabset group="language"} ## R @@ -248,9 +248,9 @@ x.filter(pl.col("sentiment") == "negative") ### Summarize -There may be a need to reduce the number of words in a given text. Usually, to -make it easier to capture its intent. To do this, use `llm_summarize()`. This -function has an argument to control the maximum number of words to output +There may be a need to reduce the number of words in a given text. Typically to +make it easier to understand its intent. The function has an argument to +control the maximum number of words to output (`max_words`): ::: {.panel-tabset group="language"} @@ -274,7 +274,7 @@ reviews.llm.summarize("review", 5) ::: To control the name of the prediction field, you can change `pred_name` argument. -This works with the other `llm` functions as well. +**This works with the other `llm` functions as well.** ::: {.panel-tabset group="language"} ## R @@ -319,7 +319,7 @@ reviews.llm.classify("review", ["computer", "appliance"]) ### Extract -One of the most interesting operations. Using natural language, we can tell the +One of the most interesting use cases. Using natural language, we can tell the LLM to return a specific part of the text. In the following example, we request that the LLM return the product being referred to. 
We do this by simply saying "product". The LLM understands what we *mean* by that word, and looks for that @@ -376,7 +376,7 @@ reviews.llm.translate("review", "spanish") ### Custom prompt It is possible to pass your own prompt to the LLM, and have `mall` run it -against each text entry. Use `llm_custom()` to access this functionality: +against each text entry: ::: {.panel-tabset group="language"} @@ -410,7 +410,14 @@ reviews.llm.custom("review", prompt = my_prompt) ::: -## Initialize session +## Model selection and settings + +You can set the model and its options to use when calling the LLM. In this case, +we refer to options as model specific things that can be set, such as seed or +temperature. + +::: {.panel-tabset group="language"} +## R Invoking an `llm` function will automatically initialize a model selection if you don't have one selected yet. If there is only one option, it will @@ -423,11 +430,6 @@ You can also setup additional arguments that will be passed down to the function that actually runs the prediction. In the case of Ollama, that function is [`chat()`](https://hauselin.github.io/ollama-r/reference/chat.html). - - -::: {.panel-tabset group="language"} -## R - The model to use, and other options can be set for the current R session ```{r} @@ -438,8 +440,8 @@ llm_use("ollama", "llama3.2", seed = 100, temperature = 0) ## python -The model to use, and other options can be set for the specific Polars -data frame +The model and options to be used will be defined at the Polars data frame +object level. If not passed, the default model will be **llama3.2**. 
```{python} #| eval: false From a989b129569028b8dbf440c9a7114a5bf3ac3366 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Mon, 7 Oct 2024 14:49:16 -0500 Subject: [PATCH 55/57] Notes Databricks is R only for site --- _quarto.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_quarto.yml b/_quarto.yml index 90c5d59..f5d5837 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -27,7 +27,7 @@ website: contents: - text: "Caching" href: articles/caching.qmd - - text: "Databricks" + - text: "Databricks (R only)" href: articles/databricks.qmd format: From 87cb1ec734567d606317185155032f9501a14246 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Mon, 7 Oct 2024 14:50:09 -0500 Subject: [PATCH 56/57] Temporarily disables CI tests --- .github/workflows/R-CMD-check.yaml | 4 ++-- .github/workflows/test-coverage.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 960d062..58ffbec 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -2,9 +2,9 @@ # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: push: - branches: main + branches: temp pull_request: - branches: main + branches: temp name: R-CMD-check.yaml diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index cf09f1b..0a6aa3d 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -2,9 +2,9 @@ # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help on: push: - branches: main + branches: temp pull_request: - branches: main + branches: temp name: test-coverage.yaml From 16003a9326fe90f3ad118a87a043837d8136ec43 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz Date: Mon, 7 Oct 2024 15:26:17 -0500 Subject: [PATCH 57/57] Adds stub README --- README.md | 13 +++++++++++++ index.qmd | 8 ++++---- 2 files changed, 17 insertions(+), 4 deletions(-) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..6fd398a --- /dev/null +++ b/README.md @@ -0,0 +1,13 @@ +# mall + + + + +Run multiple LLM predictions against a data frame. The predictions are +processed row-wise over a specified column. It works using a +pre-determined one-shot prompt, along with the current row’s content. +`mall` is now available in both R and Python. + +To find out how to install and use, or just to learn more about it, please +visit the official website: https://edgararuiz.github.io/mall/ + diff --git a/index.qmd b/index.qmd index f8cf4e8..f2d0f9b 100644 --- a/index.qmd +++ b/index.qmd @@ -25,8 +25,8 @@ mall::llm_use("ollama", "llama3.2", seed = 100, .cache = "_readme_cache") Run multiple LLM predictions against a data frame. The predictions are processed row-wise over a specified column. It works using a pre-determined one-shot prompt, -along with the current row's content. The prompt that is use will depend of the -type of analysis needed. +along with the current row's content. `mall` has been implemented for both R +and Python. The prompt that is used will depend on the type of analysis needed. Currently, the included prompts perform the following: @@ -84,12 +84,12 @@ corporate network. 
::: {.panel-tabset group="language"} ## R ```r -pak::pak("edgararuiz/mall/r@python") +pak::pak("edgararuiz/mall/r") ``` ## python ```python -pip install "mall @ git+https://git@github.com/edgararuiz/mall.git@python#subdirectory=python" +pip install "mall @ git+https://git@github.com/edgararuiz/mall.git#subdirectory=python" ``` :::