rapidsai · rapids-bot · Jan 17, 2024 · Aug 9, 2023 · Aug 9, 2023 · Aug 9, 2023
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 set -euo pipefail
 
@@ -40,8 +40,8 @@ popd
 
 rapids-logger "Build Python docs"
 pushd docs/cudf
-make dirhtml
-make text
+make dirhtml O="-j 4"
+make text O="-j 4"
 mkdir -p "${RAPIDS_DOCS_DIR}/cudf/"{html,txt}
 mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/cudf/html"
 mv build/text/* "${RAPIDS_DOCS_DIR}/cudf/txt"

@@ -12,6 +12,7 @@ dependencies:
 - benchmark==1.8.0
 - boto3>=1.21.21
 - botocore>=1.24.21
+- breathe>=4.35.0
 - c-compiler
 - cachetools
 - clang-tools=16.0.6

@@ -12,6 +12,7 @@ dependencies:
 - benchmark==1.8.0
 - boto3>=1.21.21
 - botocore>=1.24.21
+- breathe>=4.35.0
 - c-compiler
 - cachetools
 - clang-tools=16.0.6

@@ -464,9 +464,9 @@ the host (`to_host`).
 
 ### Background
 
-libcudf employs a custom-built [preload library
-docs](https://man7.org/linux/man-pages/man8/ld.so.8.html) to validate its internal stream usage (the
-code may be found
+libcudf employs a custom-built [preload
+library](https://man7.org/linux/man-pages/man8/ld.so.8.html) to validate its internal stream usage
+(the code may be found
 [`here`](https://github.com/rapidsai/cudf/blob/main/cpp/tests/utilities/identify_stream_usage.cpp)).
 This library wraps every asynchronous CUDA runtime API call that accepts a stream with a check to
 ensure that the passed CUDA stream is a valid one, immediately throwing an exception if an invalid

@@ -106,7 +106,7 @@ class strings_column_view : private column_view {
   /**
    * @brief Returns the internal column of chars
    *
-   * @throw cudf::logic error if this is an empty column
+   * @throw cudf::logic_error if this is an empty column
    * @param stream CUDA stream used for device memory operations and kernel launches
    * @return The chars column
    */

@@ -457,6 +457,7 @@ dependencies:
     common:
       - output_types: [conda]
         packages:
+          - breathe>=4.35.0
           - dask-cuda==24.2.*
           - *doxygen
           - make

diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 #
 # cudf documentation build configuration file, created by
 # sphinx-quickstart on Wed May  3 10:59:22 2017.
@@ -16,11 +16,33 @@
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 #
+import glob
 import os
+import re
 import sys
+import xml.etree.ElementTree as ET
 
 from docutils.nodes import Text
 from sphinx.addnodes import pending_xref
+from sphinx.highlighting import lexers
+from sphinx.ext import intersphinx
+from pygments.lexer import RegexLexer
+from pygments.token import Text as PText
+
+
+class PseudoLexer(RegexLexer):
+    """Trivial lexer for pseudocode.
+
+    The single ``root`` rule maps each newline-terminated line to the plain
+    Pygments ``Text`` token, so no token-specific highlighting is produced.
+    """
+
+    name = 'pseudocode'
+    aliases = ['pseudo']
+    tokens = {
+        'root': [
+            # Match one whole line (including its newline) as plain text.
+            (r'.*\n', PText),
+        ]
+    }
+
+
+# Register the lexer with Sphinx's highlighting lexer table under 'pseudo'.
+lexers['pseudo'] = PseudoLexer()
 
 # -- Custom Extensions ----------------------------------------------------
 sys.path.append(os.path.abspath("./_ext"))
@@ -35,6 +57,7 @@
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
 extensions = [
+    "breathe",
     "sphinx.ext.intersphinx",
     "sphinx.ext.autodoc",
     "sphinx.ext.autosummary",
@@ -46,6 +69,67 @@
     "myst_nb",
 ]
 
+# Preprocess doxygen xml for compatibility with latest Breathe
+def clean_definitions(root):
+    """Strip constructs from a doxygen XML tree that Breathe cannot parse.
+
+    The tree rooted at ``root`` is modified in place:
+
+    - For the first member seen with a given ``id``, every template
+      parameter whose type mentions ``enable_if`` (case-insensitive, so
+      ``CUDF_ENABLE_IF`` matches too) is removed. Later members sharing that
+      ``id`` are removed from their section altogether.
+    - For template parameters whose type does not mention ``enable_if``,
+      the parameter is removed if its default value contains ``nullptr``.
+    - A fixed set of specifiers is deleted from the text of every ``type``
+      and ``definition`` element.
+
+    ``root`` is an ``xml.etree.ElementTree.Element``; nothing is returned.
+    """
+    # Breathe can't handle SFINAE properly:
+    # https://github.com/breathe-doc/breathe/issues/624
+    seen_ids = set()
+    for sectiondef in root.findall(".//sectiondef"):
+        for memberdef in sectiondef.findall("./memberdef"):
+            id_ = memberdef.get("id")
+            # Decide once per member whether it repeats an earlier SFINAE
+            # overload. Re-checking ``seen_ids`` per parameter would treat the
+            # second enable_if parameter of a first-seen member as a
+            # duplicate and delete the whole member.
+            is_duplicate = id_ in seen_ids
+            # Guards against removing the same member twice, which would
+            # raise ValueError from Element.remove.
+            removed = False
+            for tparamlist in memberdef.findall("./templateparamlist"):
+                for param in tparamlist.findall("./param"):
+                    for type_ in param.findall("./type"):
+                        # CUDF_ENABLE_IF or std::enable_if
+                        if "enable_if" in ET.tostring(type_).decode().lower():
+                            if not is_duplicate:
+                                # If this is the first time we're seeing this function,
+                                # just remove the template parameter.
+                                seen_ids.add(id_)
+                                tparamlist.remove(param)
+                            elif not removed:
+                                # Otherwise, remove the overload altogether and just
+                                # rely on documenting one of the SFINAE overloads.
+                                sectiondef.remove(memberdef)
+                                removed = True
+                            break
+
+                        # In addition to enable_if, check for overloads set up by
+                        # ...*=nullptr.
+                        for defval in param.findall("./defval"):
+                            if "nullptr" in ET.tostring(defval).decode():
+                                try:
+                                    tparamlist.remove(param)
+                                except ValueError:
+                                    # May have already been removed in above,
+                                    # so skip.
+                                    pass
+                                break
+
+
+    # All of these in type declarations cause Breathe to choke.
+    # For friend, see https://github.com/breathe-doc/breathe/issues/916
+    strings_to_remove = ("__forceinline__", "CUDF_HOST_DEVICE", "decltype(auto)", "friend")
+    for field in (".//type", ".//definition"):
+        for type_ in root.findall(field):
+            if type_.text is not None:
+                for string in strings_to_remove:
+                    type_.text = type_.text.replace(string, "")
+
+
+def clean_all_xml_files(path):
+    """Run ``clean_definitions`` on every ``*.xml`` file directly in ``path``.
+
+    Each matching file is parsed, its root element is cleaned in place, and
+    the tree is written back to the same file name.
+    """
+    xml_pattern = os.path.join(path, "*.xml")
+    for xml_file in glob.glob(xml_pattern):
+        document = ET.parse(xml_file)
+        clean_definitions(document.getroot())
+        document.write(xml_file)
+
+
+# Breathe Configuration
+# Project name -> directory holding that project's doxygen XML output.
+breathe_projects = {"libcudf": "../../../cpp/doxygen/xml"}
+# Runs when this module is executed: every XML file in each project
+# directory is rewritten in place by clean_all_xml_files.
+for project_path in breathe_projects.values():
+    clean_all_xml_files(project_path)
+breathe_default_project = "libcudf"
+
+
 nb_execution_excludepatterns = ['performance-comparisons.ipynb']
 
 nb_execution_mode = "force"
@@ -195,11 +279,13 @@
 
 # Example configuration for intersphinx: refer to the Python standard library.
 intersphinx_mapping = {
-    "python": ("https://docs.python.org/3", None),
     "cupy": ("https://docs.cupy.dev/en/stable/", None),
+    "dlpack": ("https://dmlc.github.io/dlpack/latest/", None),
     "numpy": ("https://numpy.org/doc/stable", None),
-    "pyarrow": ("https://arrow.apache.org/docs/", None),
     "pandas": ("https://pandas.pydata.org/docs/", None),
+    "pyarrow": ("https://arrow.apache.org/docs/", None),
+    "python": ("https://docs.python.org/3", None),
+    "rmm": ("https://docs.rapids.ai/api/rmm/nightly/", None),
     "typing_extensions": ("https://typing-extensions.readthedocs.io/en/stable/", None),
 }
 
@@ -238,14 +324,170 @@ def resolve_aliases(app, doctree):
             text_node.parent.replace(text_node, Text(text_to_render, ""))
 
 
-def ignore_internal_references(app, env, node, contnode):
-    name = node.get("reftarget", None)
-    if name == "cudf.core.index.GenericIndex":
+def _generate_namespaces(namespaces):
+    """Expand a namespace mapping into a flat list of ``::``-terminated prefixes.
+
+    ``namespaces`` maps a base namespace to an iterable of nested names. For
+    each base the result contains ``base::`` followed, for every nested name,
+    by ``nested::`` and ``base::nested::`` (in that order).
+    """
+    prefixes = []
+    for base, nested in namespaces.items():
+        prefixes.append(base + "::")
+        for inner in nested:
+            prefixes += [f"{inner}::", f"{base}::{inner}::"]
+    return prefixes
+
+# Flat list of namespace prefixes (each ending in "::") built once at import
+# time; on_missing_reference tries them when a C++ reference does not resolve.
+_all_namespaces = _generate_namespaces({
+    # Note that io::datasource is actually a nested class
+    "cudf": {"io", "io::datasource", "strings", "ast", "ast::expression"},
+    # `{}` is an empty dict here; it is only iterated, so it contributes no
+    # nested prefixes.
+    "numeric": {},
+    "nvtext": {},
+})
+
+# Substrings that make on_missing_reference leave a "std"/"cpp" reference
+# unlinked: a target is skipped if any of these occurs anywhere inside it.
+_names_to_skip = {
+    # External names
+    "thrust",
+    "cuda",
+    "arrow",
+    # Unknown types
+    "int8_t",
+    "int16_t",
+    "int32_t",
+    "int64_t",
+    "__int128_t",
+    "size_t",
+    "uint8_t",
+    "uint16_t",
+    "uint32_t",
+    "uint64_t",
+    # Internal objects
+    "id_to_type_impl",
+    "type_to_scalar_type_impl",
+    "type_to_scalar_type_impl",
+    "detail",
+    # kafka objects
+    "python_callable_type",
+    "kafka_oauth_callback_wrapper_type",
+    # Template types
+    "Radix",
+    # Unsupported by Breathe
+    # https://github.com/breathe-doc/breathe/issues/355
+    "deprecated",
+    # TODO: This type is currently defined in a detail header but it's in
+    # the public namespace. However, it's used in the detail header, so it
+    # needs to be put into a public header that can be shared.
+    "char_utf8",
+    # TODO: This is currently in a src file but perhaps should be public
+    "orc::column_statistics",
+    # Sphinx doesn't know how to distinguish between the ORC and Parquet
+    # definitions because Breathe doesn't preserve namespaces for enums.
+    "TypeKind",
+}
+
+# Lazily filled by on_missing_reference on its first call:
+# C++ domain object name -> docname.
+_domain_objects = None
+# Lazily filled alongside _domain_objects:
+# "<namespace prefix><object name>" -> object name.
+_prefixed_domain_objects = None
+# (node, contnode) -> resolved intersphinx reference; only successful
+# lookups are stored.
+_intersphinx_cache = {}
+
+# Prefixes that on_missing_reference adds to (or strips from) a target when
+# retrying an intersphinx lookup.
+_intersphinx_extra_prefixes = ("rmm", "rmm::mr", "mr")
+
+
+def _cached_intersphinx_lookup(env, node, contnode):
+    """Resolve ``node`` through intersphinx, remembering successful results.
+
+    The cache is managed by hand because lru_cache doesn't handle the env
+    object properly. Entries are keyed on the ``(node, contnode)`` pair and
+    only non-``None`` results are stored; a failed lookup returns ``None``
+    and is retried on the next call.
+    """
+    cache_key = (node, contnode)
+    try:
+        return _intersphinx_cache[cache_key]
+    except KeyError:
+        pass
+
+    resolved = intersphinx.resolve_reference_detect_inventory(env, node, contnode)
+    if resolved is not None:
+        _intersphinx_cache[cache_key] = resolved
+    return resolved
+
+
+def on_missing_reference(app, env, node, contnode):
+    """Handler for Sphinx's ``missing-reference`` event.
+
+    Returns ``contnode`` to leave the reference as unlinked content, a
+    resolved reference node when the target is found in the C++ domain or
+    via intersphinx, or ``None`` when this handler cannot resolve it.
+
+    The steps, in order:
+
+    1. Rewrite a few known targets and return ``contnode``.
+    2. Return ``contnode`` for references whose ``refid`` contains "hpp".
+    3. For "std"/"cpp" references: skip targets containing any entry of
+       ``_names_to_skip``, strip template arguments, then try the C++ domain
+       with the namespace prefixes in ``_all_namespaces``.
+    4. Fall back to intersphinx, retrying with each entry of
+       ``_intersphinx_extra_prefixes`` added or removed.
+
+    ``node["reftarget"]`` may be left modified by the retries in step 4.
+    """
+    # These variables are defined outside the function to speed up the build.
+    global _all_namespaces, _names_to_skip, _intersphinx_extra_prefixes, \
+        _domain_objects, _prefixed_domain_objects, _intersphinx_cache
+
+    # Precompute and cache domains for faster lookups
+    if _domain_objects is None:
+        _domain_objects = {}
+        _prefixed_domain_objects = {}
+        for (name, _, _, docname, _, _) in env.domains["cpp"].get_objects():
+            _domain_objects[name] = docname
+            for prefix in _all_namespaces:
+                _prefixed_domain_objects[f"{prefix}{name}"] = name
+
+    reftarget = node.get("reftarget")
+    if reftarget == "cudf.core.index.GenericIndex":
         # We don't exposed docs for `cudf.core.index.GenericIndex`
         # hence we would want the docstring & mypy references to
         # use `cudf.Index`
         node["reftarget"] = "cudf.Index"
         return contnode
+    # `reftarget` is read with .get() and may be None; guard the substring
+    # tests so such a node falls through instead of raising TypeError.
+    if reftarget is not None:
+        if "namespacecudf" in reftarget:
+            node["reftarget"] = "cudf"
+            return contnode
+        if "classcudf_1_1column__device__view_" in reftarget:
+            node["reftarget"] = "cudf::column_device_view"
+            return contnode
+
+    if (refid := node.get("refid")) is not None and "hpp" in refid:
+        # We don't want to link to C++ header files directly from the
+        # Sphinx docs, those are pages that doxygen automatically
+        # generates. Adding those would clutter the Sphinx output.
+        return contnode
+
+    # .get() keeps a node without a "refdomain" attribute from raising
+    # KeyError; such a node simply is not handled here.
+    if node.get("refdomain") in ("std", "cpp") and reftarget is not None:
+        if any(toskip in reftarget for toskip in _names_to_skip):
+            return contnode
+
+        # Strip template parameters and just use the base type.
+        if match := re.search("(.*)<.*>", reftarget):
+            reftarget = match.group(1)
+
+        # Try to find the target prefixed with e.g. namespaces in case that's
+        # all that's missing.
+        # We need to do this search because the call sites may not have used
+        # the namespaces and we don't want to force them to, and we have to
+        # consider both directions because of issues like
+        # https://github.com/breathe-doc/breathe/issues/860
+        # (there may be other related issues, I haven't investigated all
+        # possible combinations of failures in depth).
+        if (name := _prefixed_domain_objects.get(reftarget)) is None:
+            for prefix in _all_namespaces:
+                if f"{prefix}{reftarget}" in _domain_objects:
+                    name = f"{prefix}{reftarget}"
+                    break
+        if name is not None:
+            return env.domains["cpp"].resolve_xref(
+                env,
+                _domain_objects[name],
+                app.builder,
+                node["reftype"],
+                name,
+                node,
+                contnode,
+            )
+
+        # Final possibility is an intersphinx lookup to see if the symbol
+        # exists in one of the other inventories. First we check the symbol
+        # itself in case it was originally templated and that caused the lookup
+        # to fail.
+        if reftarget != node["reftarget"]:
+            node["reftarget"] = reftarget
+            if (ref := _cached_intersphinx_lookup(env, node, contnode)) is not None:
+                return ref
+
+        # If the template wasn't the (only) issue, we check the various
+        # namespace prefixes that may need to be added or removed.
+        for prefix in _intersphinx_extra_prefixes:
+            if prefix not in reftarget:
+                node["reftarget"] = f"{prefix}::{reftarget}"
+                if (ref := _cached_intersphinx_lookup(env, node, contnode)) is not None:
+                    return ref
+            else:
+                node["reftarget"] = reftarget.replace(f"{prefix}::", "")
+                if (ref := _cached_intersphinx_lookup(env, node, contnode)) is not None:
+                    return ref
+
     return None
 
 
@@ -261,4 +503,4 @@ def setup(app):
     app.add_css_file("https://docs.rapids.ai/assets/css/custom.css")
     app.add_js_file("https://docs.rapids.ai/assets/js/custom.js", loading_method="defer")
     app.connect("doctree-read", resolve_aliases)
-    app.connect("missing-reference", ignore_internal_references)
+    app.connect("missing-reference", on_missing_reference)
diff --git a/docs/cudf/source/index.rst b/docs/cudf/source/index.rst
@@ -29,4 +29,5 @@ other operations.
 
    user_guide/index
    cudf_pandas/index
+   libcudf_docs/index
    developer_guide/index
diff --git a/docs/cudf/source/libcudf_docs/api_docs/aggregation_factories.rst b/docs/cudf/source/libcudf_docs/api_docs/aggregation_factories.rst
@@ -0,0 +1,5 @@
+Aggregation Factories
+=====================
+
+.. doxygengroup:: aggregation_factories
+   :members:
diff --git a/docs/cudf/source/libcudf_docs/api_docs/aggregation_groupby.rst b/docs/cudf/source/libcudf_docs/api_docs/aggregation_groupby.rst
@@ -0,0 +1,5 @@
+Aggregation Groupby
+===================
+
+.. doxygengroup:: aggregation_groupby
+   :members:
diff --git a/docs/cudf/source/libcudf_docs/api_docs/aggregation_reduction.rst b/docs/cudf/source/libcudf_docs/api_docs/aggregation_reduction.rst
@@ -0,0 +1,5 @@
+Aggregation Reduction
+=====================
+
+.. doxygengroup:: aggregation_reduction
+   :members: