Skip to content

Commit

Permalink
Merge pull request #1160 from metno/cams-obs-mem
Browse files Browse the repository at this point in the history
reduce memory consumption of cams283 obsreader
  • Loading branch information
heikoklein authored May 7, 2024
2 parents eaecbe0 + dcc6532 commit 627c580
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 13 deletions.
6 changes: 5 additions & 1 deletion pyaerocom/io/cams2_83/obs.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,9 @@ def poll_names(df: pd.DataFrame) -> pd.DataFrame:
return df.assign(poll=poll)


def read_csv(path: str | Path, *, domain: Domain = CAMS2_50_DOMAIN) -> pd.DataFrame:
def read_csv(
path: str | Path, *, domain: Domain = CAMS2_50_DOMAIN, polls: list[str] = None
) -> pd.DataFrame:
df = pd.read_csv(
path,
sep=";",
Expand All @@ -61,6 +63,8 @@ def read_csv(path: str | Path, *, domain: Domain = CAMS2_50_DOMAIN) -> pd.DataFr
usecols=lambda x: x != "_",
)
df = df.pipe(add_time).pipe(conc_units).pipe(poll_names)
if polls is not None:
df = df[df.poll.isin(polls)]
if not in_domain(df, domain=domain).all():
logger.warning("found obs outside the model domain")
df = df[in_domain(df, domain=domain)]
Expand Down
36 changes: 25 additions & 11 deletions pyaerocom/io/cams2_83/read_obs.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,17 @@ def read(
first_file: int | None = None,
last_file: int | None = None,
) -> UngriddedData:
"""Read observations as ungridded
:param vars_to_retrieve: pyaerocom-variables to read, defaults to None
None meaning all default variables
:param files: files to read, defaults to None
None meaning all known files in the date-range
:param first_file: first file to process from files, defaults to None=0
:param last_file: last file to process from files, defaults to None=-1
:raises TypeError: wrong input type
:return: ungridded data object
"""
if vars_to_retrieve is None:
vars_to_retrieve = self.DEFAULT_VARS
if isinstance(vars_to_retrieve, str):
Expand All @@ -89,21 +100,24 @@ def read(
files = files[:last_file]

start = time.time()
data = list(self.__reader(vars_to_retrieve, files))
end = time.time()
print(end - start)

ungriddeddata = UngriddedData.from_station_data(data)
print(time.time() - end, (time.time() - end) / 60.0)
logger.info(f"Start read obs")
# lazy data_iterator returns immediately, unpacked in from_station_data
data_iterator = self.__reader(vars_to_retrieve, files)
ungriddeddata = UngriddedData.from_station_data(data_iterator)
logger.info(f"Time needed to convert obs to ungridded: {time.time() - start}s")
return ungriddeddata

def read_file(self, filename, vars_to_retrieve=None):
return self.read(vars_to_retrieve, [filename])

@classmethod
def __reader(cls, vars_to_retrieve: list[str], files: list[str | Path]) -> Iterator[dict]:
logger.debug(f"reading {cls.DATA_ID} {vars_to_retrieve=} from {files=}")
data = pd.concat(read_csv(path) for path in files).drop_duplicates(
logger.info(f"reading {cls.DATA_ID} {vars_to_retrieve=}")
logger.debug(f"reading from {files=}")
reverse_aerocom = {v: k for k, v in AEROCOM_NAMES.items()}
polls = [reverse_aerocom[v] for v in vars_to_retrieve]

data = pd.concat(read_csv(path, polls=polls) for path in files).drop_duplicates(
subset=["station", "poll", "time"]
)
df: pd.DataFrame
Expand All @@ -115,8 +129,8 @@ def __reader(cls, vars_to_retrieve: list[str], files: list[str | Path]) -> Itera
latitude=df["lat"].iloc[0],
longitude=df["lon"].iloc[0],
altitude=df["alt"].iloc[0],
variables=cls.DEFAULT_VARS,
var_info=dict.fromkeys(cls.DEFAULT_VARS, dict(units="ug m-3")),
variables=vars_to_retrieve,
var_info=dict.fromkeys(vars_to_retrieve, dict(units="ug m-3")),
data_id=cls.DATA_ID,
ts_type=cls.TS_TYPE,
)
Expand All @@ -128,6 +142,6 @@ def __reader(cls, vars_to_retrieve: list[str], files: list[str | Path]) -> Itera
for poll in missing:
df[poll] = np.nan
df = df.rename(AEROCOM_NAMES, axis="columns")
for poll in cls.DEFAULT_VARS:
for poll in vars_to_retrieve:
output[poll] = df[poll]
yield output
2 changes: 1 addition & 1 deletion pyaerocom/ungriddeddata.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ def from_station_data(stats, add_meta_keys=None):
Parameters
----------
stats : list or StationData
stats : iterator or StationData
input data object(s)
add_meta_keys : list, optional
list of metadata keys that are supposed to be imported from the
Expand Down

0 comments on commit 627c580

Please sign in to comment.