From 57cf82631c23d6073ac83f91774fd4e904db8b20 Mon Sep 17 00:00:00 2001
From: Jonathan Karlsen <JONAK@equinor.com>
Date: Thu, 15 Aug 2024 08:53:00 +0200
Subject: [PATCH] Add script for generating bigpoly test-data

---
 .gitignore                           |   3 +
 test-data/bigpoly/generate_eclsum.py | 460 +++++++++++++++++++++++++++
 test-data/bigpoly/make_bigpoly.sh    | 115 +++++++
 3 files changed, 578 insertions(+)
 create mode 100644 test-data/bigpoly/generate_eclsum.py
 create mode 100755 test-data/bigpoly/make_bigpoly.sh

diff --git a/.gitignore b/.gitignore
index 5d2cf617fdc..bb4dd25bff5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -34,3 +34,6 @@ src/ert/shared/version.py
 /compile_commands.json
 *_pb2.py
 *_pb2.pyi
+/test-data/bigpoly/*
+!/test-data/bigpoly/make_bigpoly.sh
+!/test-data/bigpoly/generate_eclsum.py
diff --git a/test-data/bigpoly/generate_eclsum.py b/test-data/bigpoly/generate_eclsum.py
new file mode 100644
index 00000000000..920af9e0294
--- /dev/null
+++ b/test-data/bigpoly/generate_eclsum.py
@@ -0,0 +1,460 @@
+import os
+import sys
+from dataclasses import astuple, dataclass
+from datetime import datetime, timedelta
+from enum import Enum, unique
+from typing import Any, List, Optional, Tuple
+
+try:
+    import resfo as ecl_data_io
+except ImportError:
+    import ecl_data_io
+
+import hypothesis.strategies as st
+import numpy as np
+from hypothesis import HealthCheck, assume, given, settings
+from hypothesis.extra.numpy import from_dtype
+from pydantic import PositiveInt, conint
+from typing_extensions import Self
+
+UNIT_NAMES = st.sampled_from(
+    ["SM3/DAY", "BARSA", "SM3/SM3", "FRACTION", "DAYS", "YEARS", "SM3", "SECONDS"]
+)
+
+NAMES = st.text(
+    min_size=8, max_size=8, alphabet=st.characters(min_codepoint=65, max_codepoint=90)
+)
+
+
+@unique
+class UnitSystem(Enum):
+    METRIC = 1
+    FIELD = 2
+    LAB = 3
+
+    def to_ecl(self):
+        return self.value
+
+
+@unique
+class Simulator(Enum):
+    ECLIPSE_100 = 100
+    ECLIPSE_300 = 300
+    ECLIPSE_300_THERMAL = 500
+    INTERSECT = 700
+    FRONTSIM = 800
+
+    def to_ecl(self):
+        return self.value
+
+
+@dataclass
+class SmspecIntehead:
+    unit: UnitSystem
+    simulator: Simulator
+
+    def to_ecl(self) -> List[Any]:
+        return [value.to_ecl() for value in astuple(self)]
+
+
+@dataclass
+class Date:
+    day: conint(ge=1, le=31)
+    month: conint(ge=1, le=12)
+    year: conint(gt=1901, lt=2038)
+    hour: conint(ge=0, lt=24)
+    minutes: conint(ge=0, lt=60)
+    micro_seconds: conint(ge=0, lt=60000000)
+
+    def to_ecl(self):
+        return astuple(self)
+
+    def to_datetime(self) -> datetime:
+        return datetime(
+            year=self.year,
+            month=self.month,
+            day=self.day,
+            hour=self.hour,
+            minute=self.minutes,
+            second=self.micro_seconds // 10**6,
+            microsecond=self.micro_seconds % 10**6,
+        )
+
+    @classmethod
+    def from_datetime(cls, dt: datetime) -> Self:
+        return cls(
+            year=dt.year,
+            month=dt.month,
+            day=dt.day,
+            hour=dt.hour,
+            minutes=dt.minute,
+            micro_seconds=dt.second * 10**6 + dt.microsecond,
+        )
+
+
+@dataclass
+class Smspec:
+    intehead: SmspecIntehead
+    restart: str
+    num_keywords: PositiveInt
+    nx: PositiveInt
+    ny: PositiveInt
+    nz: PositiveInt
+    restarted_from_step: PositiveInt
+    keywords: List[str]
+    well_names: List[str]
+    region_numbers: List[int]
+    units: List[str]
+    start_date: Date
+    lgr_names: Optional[List[str]] = None
+    lgrs: Optional[List[str]] = None
+    numlx: Optional[List[PositiveInt]] = None
+    numly: Optional[List[PositiveInt]] = None
+    numlz: Optional[List[PositiveInt]] = None
+
+    def to_ecl(self) -> List[Tuple[str, Any]]:
+        # The restart field contains 9 strings of length 8 which
+        # should contain the name of the file restarted from.
+        # If shorter than 72 characters (most likely), the rest
+        # are spaces. (opm manual table F.44, keyword name RESTART)
+        restart = self.restart.ljust(72, " ")
+        restart_list = [restart[i * 8 : i * 8 + 8] for i in range(9)]
+        return [
+            ("INTEHEAD", np.array(self.intehead.to_ecl(), dtype=np.int32)),
+            ("RESTART ", restart_list),
+            (
+                "DIMENS  ",
+                np.array(
+                    [
+                        self.num_keywords,
+                        self.nx,
+                        self.ny,
+                        self.nz,
+                        0,
+                        self.restarted_from_step,
+                    ],
+                    dtype=np.int32,
+                ),
+            ),
+            ("KEYWORDS", [kw.ljust(8) for kw in self.keywords]),
+            ("WGNAMES ", self.well_names),
+            ("LGRS    ", self.lgrs),
+            ("NUMLX   ", self.numlx),
+            ("NUMLY   ", self.numly),
+            ("NUMLZ   ", self.numlz),
+            ("LGRNAMES", self.lgr_names),
+            ("NUMS    ", np.array(self.region_numbers, dtype=np.int32)),
+            ("UNITS   ", self.units),
+            ("STARTDAT", np.array(self.start_date.to_ecl(), dtype=np.int32)),
+        ]
+
+    def to_file(
+        self, filelike, file_format: ecl_data_io.Format = ecl_data_io.Format.UNFORMATTED
+    ):
+        ecl_data_io.write(filelike, self.to_ecl(), file_format)
+
+
+positives = from_dtype(np.dtype(np.int32), min_value=1, max_value=10000)
+small_ints = from_dtype(np.dtype(np.int32), min_value=1, max_value=10)
+
+
+@st.composite
+def smspecs(
+    draw,
+    sum_keys,
+    start_date,
+):
+    """
+    Strategy for smspec that ensures that the TIME parameter, as required by
+    ert, is in the parameters list.
+    """
+    sum_keys = draw(sum_keys)
+    n = len(sum_keys) + 1
+    nx = draw(small_ints)
+    ny = draw(small_ints)
+    nz = draw(small_ints)
+    keywords = ["TIME    "] + sum_keys
+    units = ["DAYS    "] + draw(st.lists(UNIT_NAMES, min_size=n - 1, max_size=n - 1))
+    well_names = [":+:+:+:+"] + draw(st.lists(NAMES, min_size=n - 1, max_size=n - 1))
+    lgrs = draw(st.lists(NAMES, min_size=n, max_size=n))
+    numlx = draw(st.lists(small_ints, min_size=n, max_size=n))
+    numly = draw(st.lists(small_ints, min_size=n, max_size=n))
+    numlz = draw(st.lists(small_ints, min_size=n, max_size=n))
+    lgr_names = list(set(lgrs))
+    region_numbers = [-32676] + draw(
+        st.lists(
+            from_dtype(np.dtype(np.int32), min_value=1, max_value=nx * ny * nz),
+            min_size=len(sum_keys),
+            max_size=len(sum_keys),
+        )
+    )
+    return draw(
+        st.builds(
+            Smspec,
+            nx=st.just(nx),
+            ny=st.just(ny),
+            nz=st.just(nz),
+            # restarted_from_step is hardcoded to 0 because
+            # of a bug in enkf_obs where it assumes that
+            # ecl_sum_get_first_report_step is always 1
+            restarted_from_step=st.just(0),
+            num_keywords=st.just(n),
+            restart=NAMES,
+            keywords=st.just(keywords),
+            well_names=st.just(well_names),
+            lgrs=st.just(lgrs),
+            numlx=st.just(numlx),
+            numly=st.just(numly),
+            numlz=st.just(numlz),
+            lgr_names=st.just(lgr_names),
+            region_numbers=st.just(region_numbers),
+            units=st.just(units),
+            start_date=start_date,
+        )
+    )
+
+
+@dataclass
+class SummaryMiniStep:
+    mini_step: int
+    params: List[float]
+
+    def to_ecl(self):
+        return [
+            ("MINISTEP", np.array([self.mini_step], dtype=np.int32)),
+            ("PARAMS  ", np.array(self.params, dtype=np.float32)),
+        ]
+
+
+@dataclass
+class SummaryStep:
+    seqnum: int
+    ministeps: List[SummaryMiniStep]
+
+    def to_ecl(self):
+        return [("SEQHDR  ", np.array([self.seqnum], dtype=np.int32))] + [
+            i for ms in self.ministeps for i in ms.to_ecl()
+        ]
+
+
+@dataclass
+class Unsmry:
+    steps: List[SummaryStep]
+
+    def to_ecl(self):
+        return [i for step in self.steps for i in step.to_ecl()]
+
+    def to_file(
+        self, filelike, file_format: ecl_data_io.Format = ecl_data_io.Format.UNFORMATTED
+    ):
+        ecl_data_io.write(filelike, self.to_ecl(), file_format)
+
+
+@st.composite
+def summaries(draw):
+    sum_keys = [
+        "AAQT",
+        "AAQT",
+        "BAPI",
+        "BAPI",
+        "BOIT",
+        "BOSAT",
+        "BPR",
+        "BPR",
+        "BTIT",
+        "CTIT",
+        "FAQR",
+        "FAQR",
+        "FGIT",
+        "FGPT",
+        "FGPT",
+        "FLOOK",
+        "FLOWI",
+        "FLOWJ",
+        "FLOWK",
+        "FLOWK",
+        "FPPT",
+        "FPR",
+        "FPR",
+        "FPR",
+        "FPR",
+        "FTFT",
+        "GKRR",
+        "GKRZ",
+        "GKRZ",
+        "GKRZ-",
+        "GLIR",
+        "GPPR",
+        "GPPT",
+        "GWFT",
+        "LBPFR",
+        "LBTIT",
+        "LCPFT",
+        "LCVPT",
+        "LWOIT",
+        "LWOPR",
+        "MAXDPR",
+        "MAXDPR",
+        "MAXDSG",
+        "MAXDSO",
+        "NAIMFRAC",
+        "NAIMFRAC",
+        "NALQT",
+        "NBVIT",
+        "NCGIR",
+        "NCLIT",
+        "NFGFR",
+        "NFVFR",
+        "NFVIT",
+        "NGVFR",
+        "NGWPT",
+        "NLBWIT",
+        "NLINSMAX",
+        "NLINSMIN",
+        "NLWGPR",
+        "NLWLPR",
+        "NRVFT",
+        "NSOIR",
+        "NSTPR",
+        "NSVFT",
+        "OKRY",
+        "RANQT",
+        "RCOFT",
+        "RCOPT",
+        "RGFR",
+        "RGOPR",
+        "RGPR",
+        "RGVIT",
+        "RLIR",
+        "RNFT",
+        "RNFT",
+        "ROFR",
+        "ROFR",
+        "ROFR",
+        "ROFR",
+        "ROFR+",
+        "ROFR-",
+        "ROFT+",
+        "ROFT+",
+        "ROFT-",
+        "ROFTG",
+        "RRTFR ",
+        "RSVIR",
+        "RSVIT",
+        "RTFR",
+        "RWF T-",
+        "RWFR",
+        "RWLFR",
+        "SALQ",
+        "SALQ",
+        "SFR",
+        "SFR",
+        "SFR",
+        "SGFR",
+        "SGFR",
+        "SGFRF",
+        "SGFRF",
+        "SGFRS",
+        "SGFRS",
+        "SGFRS",
+        "SGFT",
+        "SGFT",
+        "SGFTA",
+        "SGFTA",
+        "SOPT",
+        "STEPTYPE",
+        "VELGK",
+        "VELOI",
+        "VELOJ",
+        "VELOK",
+        "VELOK",
+        "VELOK",
+        "VELWJ",
+        "WAAQR",
+        "WANQT",
+        "WBHP",
+        "WBHP",
+        "WBHPT",
+        "WBP4",
+        "WBP5",
+        "WBP5",
+        "WFGIT",
+        "WGGPT",
+        "WKRR",
+        "WKRR-",
+        "WKRX",
+        "WKRX",
+        "WKRX",
+        "WKRZ-",
+        "WKRZ-",
+        "WPIO",
+        "WPIO",
+        "WSLIT",
+        "WVIT",
+        "WWCT",
+        "WWCT",
+        "WWCT",
+    ]
+    first_date = datetime.strptime("1999-1-1", "%Y-%m-%d")
+    smspec = draw(
+        smspecs(
+            sum_keys=st.just(sum_keys),
+            start_date=st.just(
+                Date(
+                    year=first_date.year,
+                    month=first_date.month,
+                    day=first_date.day,
+                    hour=first_date.hour,
+                    minutes=first_date.minute,
+                    micro_seconds=first_date.second * 10**6 + first_date.microsecond,
+                )
+            ),
+        )
+    )
+
+    assume(
+        len(set(zip(smspec.keywords, smspec.region_numbers, smspec.well_names)))
+        == len(smspec.keywords)
+    )
+    dates = np.arange(0.0, 50000.0)
+    try:
+        _ = first_date + timedelta(days=max(dates))
+    except (ValueError, OverflowError):  # datetime has a max year
+        print(f"Failed assumption of max_date {max(dates)}, {first_date}")
+        assume(False)
+
+    ds = sorted(dates, reverse=True)
+    steps = []
+    i = 0
+    j = 0
+    while len(ds) > 0:
+        minis = []
+        for _ in range(min(3, len(ds))):
+            data = np.zeros(len(sum_keys) + 1)
+            data[0] = ds.pop()
+            minis.append(SummaryMiniStep(i, data))
+            i += 1
+        steps.append(SummaryStep(j, minis))
+        j += 1
+    return smspec, Unsmry(steps)
+
+
+@settings(
+    suppress_health_check=[
+        HealthCheck.too_slow,
+        HealthCheck.data_too_large,
+        HealthCheck.large_base_example,
+    ],
+    max_examples=1,
+)
+@given(summaries())
+def main(summary):
+    smspec, unsmry = summary
+    path = sys.argv[1]
+    unsmry.to_file(f"{path}.UNSMRY")
+    smspec.to_file(f"{path}.SMSPEC")
+    os._exit(0)
+
+
+if __name__ == "__main__":
+    main()  # pylint: disable=no-value-for-parameter
diff --git a/test-data/bigpoly/make_bigpoly.sh b/test-data/bigpoly/make_bigpoly.sh
new file mode 100755
index 00000000000..c879f681fc5
--- /dev/null
+++ b/test-data/bigpoly/make_bigpoly.sh
@@ -0,0 +1,115 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+cp -a ../poly_example/ .
+polystub=poly_stub.ert
+touch $polystub
+
+# shellcheck disable=SC2130 # style
+echo "JOBNAME bigpoly_%d" >> $polystub
+echo "RUNPATH poly_out/realization-<IENS>/iter-<ITER>" >> $polystub
+echo "OBS_CONFIG observations" >> $polystub
+echo "MAX_SUBMIT 1" >> $polystub
+echo "GEN_KW COEFFS coeff_priors" >> $polystub
+echo "GEN_DATA POLY_RES RESULT_FILE:poly.out" >> $polystub
+echo "INSTALL_JOB poly_eval POLY_EVAL" >> $polystub
+
+echo "EXECUTABLE env" > ENV
+echo "INSTALL_JOB env ENV" >> $polystub
+
+
+###########################################
+echo "Making small poly"
+
+polysmall=smallpoly.ert
+cp $polystub $polysmall
+echo "NUM_REALIZATIONS 2" >> $polysmall
+echo "SIMULATION_JOB poly_eval" >> $polysmall
+
+polysmall_local=smallpoly_local.ert
+
+cp $polysmall $polysmall_local
+echo "QUEUE_SYSTEM LOCAL" >> $polysmall_local
+echo "QUEUE_OPTION LOCAL MAX_RUNNING 10" >> $polysmall_local
+
+
+###########################################
+echo "Making small poly with FIELD update"
+
+polyfile_field=smallpoly_field.ert
+cp $polystub $polyfile_field
+
+# shellcheck disable=SC2129 # style
+echo "NUM_REALIZATIONS 2" >> $polyfile_field
+echo "INSTALL_JOB symlink_grdecl SYMLINK_GRDECL" >> $polyfile_field
+echo "SIMULATION_JOB poly_eval" >> $polyfile_field
+echo "SIMULATION_JOB symlink_grdecl" >> $polyfile_field
+echo "GRID ERTBOX.EGRID" >> $polyfile_field
+echo "FIELD F_PARAM PARAMETER fieldparam.grdecl INIT_FILES:fieldparam.grdecl INIT_TRANSFORM:LOG OUTPUT_TRANSFORM:EXP MIN:-5.5 MAX:5.5 FORWARD_INIT:True" >> $polyfile_field
+
+cat > make_egrid.py << EOF
+import xtgeo
+grid = xtgeo.create_box_grid(dimension=(200, 200, 200))
+grid.to_file("ERTBOX.EGRID", "egrid")
+EOF
+
+python make_egrid.py
+
+cat > make_random_grdecl.py << EOF
+#!/bin/env python
+import numpy as np
+values = np.random.uniform(size=200*200*200)
+with open("fieldparam.grdecl", "w", encoding="utf-8") as filehandle:
+    filehandle.write("F_PARAM\n")
+    filehandle.write(" ".join([str(val) for val in values]) + " \n/\n")
+EOF
+python make_random_grdecl.py
+# This produces a file fieldparam.grdecl which we can use as a
+# static file in all realizations by symlinking
+
+echo "#!/bin/bash" > symlink_grdecl.sh
+echo "ln -sf ../../../fieldparam.grdecl" >> symlink_grdecl.sh
+chmod a+x symlink_grdecl.sh
+echo "EXECUTABLE symlink_grdecl.sh" > SYMLINK_GRDECL
+
+
+polyfield_local=smallpoly_field_local.ert
+cp smallpoly_field.ert $polyfield_local
+
+echo "QUEUE_SYSTEM LOCAL" >> $polyfield_local
+echo "QUEUE_OPTION LOCAL MAX_RUNNING 10" >> $polyfield_local
+
+
+###########################################
+echo "Generate random summary file"
+
+python3 -m venv venv
+source venv/bin/activate
+pip install hypothesis resfo pydantic
+python generate_eclsum.py BIGPOLY
+
+
+#####################################################################################
+echo "Make bigpoly (many realizations, many forward_models and with summary-file)"
+
+polybig=bigpoly.ert
+
+cp poly_stub.ert $polybig
+
+num_real=100
+
+# shellcheck disable=SC2129 # style
+echo "NUM_REALIZATIONS $num_real" >> $polybig
+echo "ECLBASE BIGPOLY" >> $polybig
+echo "SUMMARY *" >> $polybig
+echo "FORWARD_MODEL COPY_FILE(<FROM>=<CONFIG_PATH>/BIGPOLY.SMSPEC,<TO>=BIGPOLY.SMSPEC)" >> $polybig
+echo "FORWARD_MODEL COPY_FILE(<FROM>=<CONFIG_PATH>/BIGPOLY.UNSMRY,<TO>=BIGPOLY.UNSMRY)" >> $polybig
+
+for _ in $(seq 0 $num_real); do
+  echo "SIMULATION_JOB env" >> $polybig
+done
+
+echo "SIMULATION_JOB poly_eval" >> $polybig
+
+for _ in $(seq 0 $num_real); do
+  echo "SIMULATION_JOB env" >> $polybig
+done