diff --git a/.github/workflows/cleanup.yml b/.github/workflows/cleanup.yml index 5fd75af..5695c83 100644 --- a/.github/workflows/cleanup.yml +++ b/.github/workflows/cleanup.yml @@ -1,19 +1,93 @@ -# .gitHub/workflows/cleanup.yml -name: Cleanup Deployments +# .github/workflows/cleanup.yml +name: Repository Cleanup on: - workflow_dispatch: # allow manual triggering + workflow_dispatch: + inputs: + action_type: + description: 'Select the action to perform' + required: true + type: choice + options: + - 'Cleanup Workflow' + - 'Cleanup Deployments' + workflow_status: + description: 'Workflow status to clean up (only needed when Cleanup Workflow is selected)' + required: false + type: choice + options: + - 'disabled' # disabled workflows + - 'active' # active workflows + - 'all' # all workflows + environment: + description: 'Deployment environment to clean up (only needed when Cleanup Deployments is selected)' + required: false + type: choice + options: + - 'all' + - 'github-pages' + - 'pypi' jobs: - cleanup: + cleanup-workflows: + if: ${{ github.event.inputs.action_type == 'Cleanup Workflow' }} + runs-on: ubuntu-latest + permissions: + actions: write + steps: + - name: Cleanup workflows + uses: actions/github-script@v7 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + const status = '${{ github.event.inputs.workflow_status }}'; + console.log(`Cleaning up workflows with status: ${status}`); + + // fetch all workflows in the repository + const workflows = await github.rest.actions.listRepoWorkflows({ + owner: context.repo.owner, + repo: context.repo.repo + }); + + for (const workflow of workflows.data.workflows) { + // filter workflows by the selected status + if (status === 'all' || + (status === 'disabled' && workflow.state !== 'active') || + (status === 'active' && workflow.state === 'active')) { + + console.log(`Processing workflow: ${workflow.name} (${workflow.state})`); + + // fetch all runs of this workflow + const runs = await github.rest.actions.listWorkflowRuns({ + owner: context.repo.owner, + repo: context.repo.repo, + workflow_id: workflow.id, + }); + + // delete the runs + console.log(`Found ${runs.data.total_count} runs to delete`); + for (const run of runs.data.workflow_runs) { + console.log(`Deleting run #${run.run_number} of ${workflow.name}`); + await github.rest.actions.deleteWorkflowRun({ + owner: context.repo.owner, + repo: context.repo.repo, + run_id: run.id + }); + } + } + } + console.log('Cleanup completed'); + + cleanup-deployments: + if: ${{ github.event.inputs.action_type == 'Cleanup Deployments' }} runs-on: ubuntu-latest permissions: deployments: write actions: write contents: write - steps: - name: Delete github-pages deployments + if: ${{ github.event.inputs.environment == 'github-pages' || github.event.inputs.environment == 'all' }} uses: strumwolf/delete-deployment-environment@v2 with: token: ${{ secrets.GITHUB_TOKEN }} @@ -21,8 +95,9 @@ jobs: onlyRemoveDeployments: true - name: Delete pypi deployments + if: ${{ github.event.inputs.environment == 'pypi' || github.event.inputs.environment == 'all' }} uses: strumwolf/delete-deployment-environment@v2 with: token: ${{ secrets.GITHUB_TOKEN }} environment: pypi - onlyRemoveDeployments: true + onlyRemoveDeployments: true \ No newline at end of file diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index a91cd84..99602c0 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -1,54 +1,135 @@ -# .gitHub/workflows/publish.yml -name: Publish to PyPI +name: Publish AeroViz on: push: tags: - - 'v*' # triggered when a version tag is pushed, e.g. v0.1.0 + - 'v*' jobs: - build-and-publish: + build-and-test: + strategy: + matrix: + python-version: [ "3.11", "3.12" ] runs-on: ubuntu-latest - environment: - name: pypi steps:
- uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 with: - python-version: '3.x' + python-version: ${{ matrix.python-version }} + cache: 'pip' - name: Install dependencies run: | python -m pip install --upgrade pip - pip install setuptools wheel build twine + pip install setuptools wheel build + pip install -e . + pip install -e ".[test]" - - name: Extract version from tag - id: get_version - run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_ENV + - name: Run tests + run: | + pytest tests/ -m "not requires_data" - - name: Verify version matches + - name: Verify package version matches tag run: | - VERSION=$(python setup.py --version) - if [ "$VERSION" != "${{ env.VERSION }}" ]; then - echo "Version mismatch: Tag version (${{ env.VERSION }}) doesn't match package version ($VERSION)" + TAG_VERSION=${GITHUB_REF#refs/tags/v} + PACKAGE_VERSION=$(python setup.py --version) + + if [ "$PACKAGE_VERSION" != "$TAG_VERSION" ]; then + echo "Version mismatch:" + echo " - Tag version: $TAG_VERSION" + echo " - Package version: $PACKAGE_VERSION" exit 1 + else + echo "Version match: $TAG_VERSION" fi - name: Build package run: python -m build - - name: Publish to Test PyPI - env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.TEST_PYPI_TOKEN }} - run: | - twine upload --repository testpypi dist/* + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: python-package-distributions-${{ matrix.python-version }} + path: dist/ + + publish-test: + needs: build-and-test + runs-on: ubuntu-latest + environment: + name: test-pypi + url: https://test.pypi.org/p/AeroViz + permissions: + id-token: write + + steps: + # Download artifacts from Python 3.12 build only + - name: Download artifacts + uses: actions/download-artifact@v4 + with: + name: python-package-distributions-3.12 + path: dist/ + + - name: Publish to TestPyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + repository-url: https://test.pypi.org/legacy/ + + publish-prod: + needs: publish-test + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/AeroViz + permissions: + id-token: write + + steps: + - name: Download artifacts + uses: actions/download-artifact@v4 + with: + name: python-package-distributions-3.12 + path: dist/ - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + + github-release: + name: Create GitHub Release + needs: publish-prod + runs-on: ubuntu-latest + permissions: + contents: write + id-token: write + + steps: + - name: Download artifacts + uses: actions/download-artifact@v4 + with: + name: python-package-distributions-3.12 + path: dist/ + + - name: Sign the dists with Sigstore + uses: sigstore/gh-action-sigstore-python@v2.1.1 + with: + inputs: >- + ./dist/*.tar.gz + ./dist/*.whl + + - name: Create GitHub Release env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} - run: | - twine upload dist/* \ No newline at end of file + GITHUB_TOKEN: ${{ github.token }} + run: >- + gh release create + '${{ github.ref_name }}' + --repo '${{ github.repository }}' + --notes "Release ${{ github.ref_name }}" + + - name: Upload artifacts to GitHub Release + env: + GITHUB_TOKEN: ${{ github.token }} + run: >- + gh release upload + '${{ github.ref_name }}' dist/** + --repo '${{ github.repository }}' \ No newline at end of file diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 3265745..99829c3 100644 
--- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -2,30 +2,48 @@ name: Python Tests on: push: - branches: [ main ] + branches: [ main, master ] pull_request: - branches: [ main ] + branches: [ main, master ] jobs: test: - runs-on: ubuntu-latest + strategy: + matrix: + python-version: [ "3.11", "3.12" ] + os: [ ubuntu-latest ] + + fail-fast: false + + runs-on: ${{ matrix.os }} steps: - name: Checkout code uses: actions/checkout@v4 - - name: Set up Python 3.12 + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: - python-version: "3.12" + python-version: ${{ matrix.python-version }} + cache: 'pip' # enable pip caching to speed up installs - name: Install dependencies run: | python -m pip install --upgrade pip - pip install pytest - pip install -r requirements/requirements.txt pip install -e . + pip install -e ".[test]" - - name: Run tests + - name: Run tests with coverage run: | - pytest tests/test_aeroviz_import.py \ No newline at end of file + pytest tests/ -m "not requires_data" \ + --cov=AeroViz \ + --cov-report=term-missing \ + --cov-report=xml \ + -v + + - name: Upload coverage reports + uses: actions/upload-artifact@v4 + with: + name: coverage-report-${{ matrix.python-version }}-${{ github.sha }} + path: coverage.xml + if-no-files-found: error diff --git a/AeroViz/__init__.py b/AeroViz/__init__.py index 6142d50..8063006 100644 --- a/AeroViz/__init__.py +++ b/AeroViz/__init__.py @@ -2,13 +2,12 @@ from AeroViz import plot from AeroViz.dataProcess import DataProcess from AeroViz.rawDataReader import RawDataReader -from AeroViz.tools import DataBase, DataReader, DataClassifier +from AeroViz.tools import DataBase, DataClassifier __all__ = [ 'plot', 'RawDataReader', 'DataProcess', 'DataBase', - 'DataReader', 'DataClassifier' ] diff --git a/AeroViz/plot/optical/PyMieScatt_update.py b/AeroViz/dataProcess/Optical/PyMieScatt_update.py similarity index 98% rename from AeroViz/plot/optical/PyMieScatt_update.py rename to AeroViz/dataProcess/Optical/PyMieScatt_update.py index b2a9a35..d5e6a43 100644 --- a/AeroViz/plot/optical/PyMieScatt_update.py +++ b/AeroViz/dataProcess/Optical/PyMieScatt_update.py @@ -6,13 +6,6 @@ from scipy.special import jv, yv -def coerceDType(d): - if type(d) is not np.ndarray: - return np.array(d) - else: - return d - - def MieQ(m, wavelength, diameter, nMedium=1.0, asDict=False, asCrossSection=False): # http://pymiescatt.readthedocs.io/en/latest/forward.html#MieQ nMedium = nMedium.real @@ -271,8 +264,8 @@ def Mie_SD(m, wavelength, dp, ndp, nMedium=1.0, SMPS=True, interpolate=False, as nMedium = nMedium.real m /= nMedium wavelength /= nMedium - dp = coerceDType(dp) - ndp = coerceDType(ndp) + dp = np.array(dp) + ndp = np.array(ndp) _length = np.size(dp) Q_ext = np.zeros(_length) Q_sca = np.zeros(_length) @@ -373,8 +366,8 @@ def SF_SD(m, wavelength, dp, ndp, nMedium=1.0, minAngle=0, maxAngle=180, angular wavelength /= nMedium _steps = int(1 + (maxAngle - minAngle) / angularResolution) - ndp = coerceDType(ndp) - dp = coerceDType(dp) + ndp = np.array(ndp) + dp = np.array(dp) SL = np.zeros(_steps) SR = np.zeros(_steps) SU = np.zeros(_steps) diff --git a/AeroViz/plot/optical/mie_theory.py b/AeroViz/dataProcess/Optical/mie_theory.py similarity index 100% rename from AeroViz/plot/optical/mie_theory.py rename to AeroViz/dataProcess/Optical/mie_theory.py diff --git a/AeroViz/dataProcess/SizeDistr/prop.py b/AeroViz/dataProcess/SizeDistr/prop.py new file mode 100644 index 0000000..d55a8db --- /dev/null +++ b/AeroViz/dataProcess/SizeDistr/prop.py @@ -0,0 +1,62 @@ +import numpy
as np +from numpy import exp, log +from scipy.signal import find_peaks + + +def geometric(dp: np.ndarray, + dist: np.ndarray + ) -> tuple[float, float]: + """ Calculate the geometric mean and standard deviation. """ + + _gmd = (((dist * log(dp)).sum()) / dist.sum()) + + logdp_mesh, gmd_mesh = np.meshgrid(log(dp), _gmd) + _gsd = ((((logdp_mesh - gmd_mesh) ** 2) * dist).sum() / dist.sum()) ** .5 + + return exp(_gmd), exp(_gsd) + + +def contribution(dp: np.ndarray, + dist: np.ndarray + ) -> tuple[float, float, float]: + """ Calculate the relative contribution of each mode. """ + + ultra = dist[(dp >= 11.8) & (dp < 100)].sum() / dist.sum() + accum = dist[(dp >= 100) & (dp < 1000)].sum() / dist.sum() + coars = dist[(dp >= 1000) & (dp < 2500)].sum() / dist.sum() + + return ultra, accum, coars + + +def mode(dp: np.ndarray, + dist: np.ndarray + ) -> np.ndarray: + """ Find the peak modes in a distribution. """ + + min_value = np.array([dist.min()]) + mode, _ = find_peaks(np.concatenate([min_value, dist, min_value]), distance=len(dist) - 1) + + return dp[mode - 1] + + +def properties(dist, + dp: np.ndarray, + dlogdp: np.ndarray, + weighting: str + ) -> dict: + """ Compute summary properties of a distribution (intended for DataFrame.apply). """ + dist = np.array(dist) + + gmd, gsd = geometric(dp, dist) + ultra, accum, coarse = contribution(dp, dist) + peak = mode(dp, dist) + + return {key: round(value, 3) for key, value in + {f'total_{weighting}': (dist * dlogdp).sum(), + f'GMD_{weighting}': gmd, + f'GSD_{weighting}': gsd, + f'mode_{weighting}': peak[0], + f'ultra_{weighting}': ultra, + f'accum_{weighting}': accum, + f'coarse_{weighting}': coarse} + .items()} diff --git a/AeroViz/plot/__init__.py b/AeroViz/plot/__init__.py index 93df043..b5b648e 100644 --- a/AeroViz/plot/__init__.py +++ b/AeroViz/plot/__init__.py @@ -3,7 +3,6 @@ from .
import optical from .bar import bar from .box import box -from .hysplit import hysplit from .pie import pie, donuts from .radar import radar from .regression import linear_regression, multiple_linear_regression diff --git a/AeroViz/plot/hysplit/__init__.py b/AeroViz/plot/hysplit/__init__.py deleted file mode 100644 index 8b90a72..0000000 --- a/AeroViz/plot/hysplit/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .hysplit import * diff --git a/AeroViz/plot/meteorology/meteorology.py b/AeroViz/plot/meteorology/CBPF.py similarity index 52% rename from AeroViz/plot/meteorology/meteorology.py rename to AeroViz/plot/meteorology/CBPF.py index 11b941c..193fb37 100644 --- a/AeroViz/plot/meteorology/meteorology.py +++ b/AeroViz/plot/meteorology/CBPF.py @@ -1,85 +1,159 @@ import math -from typing import Literal import matplotlib.pyplot as plt import numpy as np import pandas as pd -import windrose from matplotlib.pyplot import Figure, Axes from pandas import DataFrame, Series from scipy.ndimage import gaussian_filter from AeroViz.plot.utils import * -__all__ = ['wind_rose', - 'CBPF' - ] +__all__ = ['CBPF'] + + +def improve_density_estimation(df, WS, WD, val, resolution=100, bandwidth=None): + """ + Improved density estimation that uses a KDE to produce a smoother distribution + + Parameters: + ----------- + df : DataFrame + DataFrame containing the wind speed and wind direction data + WS : str + Name of the wind speed column + WD : str + Name of the wind direction column + val : str + Name of the variable column to analyze + resolution : int + Grid resolution + bandwidth : float or tuple + KDE bandwidth parameter; chosen automatically if None + """ + from scipy.stats import gaussian_kde + import numpy as np + + # convert to Cartesian coordinates + u = df[WS] * np.sin(np.radians(df[WD])) + v = df[WS] * np.cos(np.radians(df[WD])) + + # create the grid + u_range = np.linspace(u.min(), u.max(), resolution) + v_range = np.linspace(v.min(), v.max(), resolution) + U, V = np.meshgrid(u_range, v_range) + + # prepare the positions for the KDE + positions = np.vstack([U.ravel(), V.ravel()]) + values = np.vstack([u, v]) + + # estimate the density with the KDE + kernel = gaussian_kde(values, bw_method=bandwidth) + Z = np.reshape(kernel(positions), U.shape) + + # normalize the density values to the [0, 1] interval + Z = (Z - Z.min()) / (Z.max() - Z.min()) + + # apply a polar-coordinate mask + center_u = len(u_range) // 2 + center_v = len(v_range) // 2 + max_radius = min(center_u, center_v) + + Y, X = np.ogrid[-center_v:resolution - center_v, -center_u:resolution - center_u] + mask = X * X + Y * Y > max_radius * max_radius + Z[mask] = np.nan + + return Z, U, V + + +def smooth_and_clean(Z, smooth_radius=2, min_density=1): + """ + Smooth and clean the density map, removing isolated points + + Parameters: + ----------- + Z : ndarray + Density estimation result + smooth_radius : int + Smoothing radius + min_density : float + Minimum density threshold + """ + from scipy.ndimage import gaussian_filter + + # apply Gaussian smoothing first + Z_smooth = gaussian_filter(Z, sigma=smooth_radius) + + # remove points below the threshold + # Z_smooth[Z_smooth < min_density] = np.nan + + # remove isolated points + rows, cols = Z_smooth.shape + for i in range(rows): + for j in range(cols): + if not np.isnan(Z_smooth[i, j]): + # check the neighboring points + neighborhood = Z_smooth[ + max(0, i - smooth_radius):min(rows, i + smooth_radius + 1), + max(0, j - smooth_radius):min(cols, j + smooth_radius + 1) + ] + if np.count_nonzero(~np.isnan(neighborhood)) < 1: # too few valid neighbors + Z_smooth[i, j] = np.nan + + return Z_smooth + + +def is_within_circle(center_row, center_col, row, col, radius): + return np.sqrt((center_row - row) ** 2 + (center_col - col) ** 2) <= radius + + +def remove_lonely_point(filtered_histogram, radius=4, magic_num=13): + rows, cols = filtered_histogram.shape + data_positions = np.where(~np.isnan(filtered_histogram)) + + for row, col in zip(*data_positions): + valid_data_count = 0 + for i in range(max(0, row - radius), min(rows, row + radius + 1)): 
for j in range(max(0, col - radius), min(cols, col + radius + 1)): + if (i, j) != (row, col) and is_within_circle(row, col, i, j, radius): + if not np.isnan(filtered_histogram[i, j]): + valid_data_count += 1 + + if valid_data_count <= magic_num: + filtered_histogram[row, col] = np.nan + + return filtered_histogram + + +def fill_nan_with_mean(filtered_histogram, radius=4, magic_num=13): + rows, cols = filtered_histogram.shape + nan_positions = np.where(np.isnan(filtered_histogram)) + + for row, col in zip(*nan_positions): + surrounding_values = [] + surrounding_values_within_one = [] + nan_count = 0 + for i in range(max(0, row - radius), min(rows, row + radius + 1)): + for j in range(max(0, col - radius), min(cols, col + radius + 1)): + if (i, j) != (row, col) and is_within_circle(row, col, i, j, radius): + if np.isnan(filtered_histogram[i, j]): + nan_count += 1 + else: + surrounding_values.append(filtered_histogram[i, j]) -@set_figure(figsize=(4.3, 4)) -def wind_rose(df: DataFrame, - WS: Series | str, - WD: Series | str, - val: Series | str | None = None, - typ: Literal['bar', 'scatter'] = 'scatter', - rlabel_pos: float = 30, - **kwargs - ) -> tuple[Figure, Axes]: - # conditional bivariate probability function (cbpf) python - # https://davidcarslaw.github.io/openair/reference/polarPlot.html - # https://github.com/davidcarslaw/openair/blob/master/R/polarPlot.R - windrose.WindroseAxes._info = 'WindroseAxes' - - df = df.dropna(subset=[WS, WD] + ([val] if val is not None else [])) - - radius = df[WS].to_numpy() - theta = df[WD].to_numpy() - radian = np.radians(theta) - values = df[val].to_numpy() if val is not None else None - - # In this case, the windrose is a simple frequency diagram, - # the function automatically calculates the radians of the given wind direction. - if typ == 'bar': - fig, ax = plt.subplots(figsize=(5.5, 4), subplot_kw={'projection': 'windrose'}) - fig.subplots_adjust(left=0) - - ax.bar(theta, radius, bins=[0, 1, 2, 3], normed=True, colors=['#0F1035', '#365486', '#7FC7D9', '#DCF2F1']) - ax.set( - ylim=(0, 30), - yticks=[0, 15, 30], - yticklabels=['', '15 %', '30 %'], - rlabel_position=rlabel_pos - ) - ax.set_thetagrids(angles=[0, 45, 90, 135, 180, 225, 270, 315], - labels=["E", "NE", "N", "NW", "W", "SW", "S", "SE"]) - - ax.legend(units='m/s', bbox_to_anchor=[1.1, 0.5], loc='center left', ncol=1) - - # In this case, the windrose is a scatter plot, - # in contrary, this function does not calculate the radians, so user have to input the radian. 
- else: - fig, ax = plt.subplots(figsize=(5, 4), subplot_kw={'projection': 'windrose'}) - fig.subplots_adjust(left=0) - - scatter = ax.scatter(radian, radius, s=15, c=values, vmax=np.quantile(values, 0.90), edgecolors='none', - cmap='jet', alpha=0.8) - ax.set( - ylim=(0, 7), - yticks=[1, 3, 5, 7], - yticklabels=['1 m/s', '3 m/s', '5 m/s', '7 m/s'], - rlabel_position=rlabel_pos, - theta_direction=-1, - theta_zero_location='N', - title=kwargs.get('title', None) - ) - ax.set_thetagrids(angles=[0, 45, 90, 135, 180, 225, 270, 315], - labels=["N", "NE", "E", "SE", "S", "SW", "W", "NW"]) - - plt.colorbar(scatter, ax=ax, label=Unit(val), pad=0.1, fraction=0.04) + for i in range(max(0, row - 2), min(rows, row + 2 + 1)): + for j in range(max(0, col - 2), min(cols, col + 2 + 1)): + if (i, j) != (row, col) and is_within_circle(row, col, i, j, 2): + if np.isnan(filtered_histogram[i, j]): + pass + else: + surrounding_values_within_one.append(filtered_histogram[i, j]) - plt.show() + if nan_count < magic_num and surrounding_values_within_one: + filtered_histogram[row, col] = np.mean(surrounding_values) - return fig, ax + return filtered_histogram # TODO: fix the bug of the CBPF function @@ -90,7 +164,7 @@ def CBPF(df: DataFrame, val: Series | str | None = None, percentile: list | float | int | None = None, max_ws: float | None = 5, - resolution: int = 100, + resolution: int = 50, sigma: float | tuple = 2, rlabel_pos: float = 30, bottom_text: str | bool | None = None, @@ -157,64 +231,18 @@ def CBPF(df: DataFrame, histogram_filled = np.nan_to_num(histogram, nan=0) # replace NaN with 0 filtered_histogram = gaussian_filter(histogram_filled, sigma=sigma) - filtered_histogram[np.isnan(histogram)] = np.nan - - def is_within_circle(center_row, center_col, row, col, radius): - return np.sqrt((center_row - row) ** 2 + (center_col - col) ** 2) <= radius - - def remove_lonely_point(filtered_histogram, radius=4, magic_num=13): - rows, cols = filtered_histogram.shape - data_positions = np.where(~np.isnan(filtered_histogram)) - - for row, col in zip(*data_positions): - valid_data_count = 0 - for i in range(max(0, row - radius), min(rows, row + radius + 1)): - for j in range(max(0, col - radius), min(cols, col + radius + 1)): - if (i, j) != (row, col) and is_within_circle(row, col, i, j, radius): - if not np.isnan(filtered_histogram[i, j]): - valid_data_count += 1 - - if valid_data_count <= magic_num: - filtered_histogram[row, col] = np.nan - - return filtered_histogram - - def fill_nan_with_mean(filtered_histogram, radius=4, magic_num=13): - rows, cols = filtered_histogram.shape - nan_positions = np.where(np.isnan(filtered_histogram)) - - for row, col in zip(*nan_positions): - surrounding_values = [] - surrounding_values_within_one = [] - nan_count = 0 - - for i in range(max(0, row - radius), min(rows, row + radius + 1)): - for j in range(max(0, col - radius), min(cols, col + radius + 1)): - if (i, j) != (row, col) and is_within_circle(row, col, i, j, radius): - if np.isnan(filtered_histogram[i, j]): - nan_count += 1 - else: - surrounding_values.append(filtered_histogram[i, j]) - - for i in range(max(0, row - 2), min(rows, row + 2 + 1)): - for j in range(max(0, col - 2), min(cols, col + 2 + 1)): - if (i, j) != (row, col) and is_within_circle(row, col, i, j, 2): - if np.isnan(filtered_histogram[i, j]): - pass - else: - surrounding_values_within_one.append(filtered_histogram[i, j]) - - if nan_count < magic_num and surrounding_values_within_one: - filtered_histogram[row, col] = np.mean(surrounding_values) - - return 
filtered_histogram + # filtered_histogram[np.isnan(histogram)] = np.nan + # breakpoint() + # filtered_histogram = smooth_and_clean(filtered_histogram) # Apply the function to your data - fil_radius, magic_num = 3, 13 - filtered_histogram = remove_lonely_point(filtered_histogram, fil_radius, magic_num) - filtered_histogram = fill_nan_with_mean(filtered_histogram, fil_radius, magic_num) + # fil_radius, magic_num = 3, 13 + # filtered_histogram = remove_lonely_point(filtered_histogram, fil_radius, magic_num) + # filtered_histogram = fill_nan_with_mean(filtered_histogram, fil_radius, magic_num) + if np.all(np.isnan(filtered_histogram)): raise ValueError("All values in the filtered histogram are NaN. Please decrease the resolution.") + # plot fig, ax = plt.subplots() fig.subplots_adjust(left=0) diff --git a/AeroViz/plot/meteorology/__init__.py b/AeroViz/plot/meteorology/__init__.py index d2a0d9f..de5ea1f 100644 --- a/AeroViz/plot/meteorology/__init__.py +++ b/AeroViz/plot/meteorology/__init__.py @@ -1 +1,3 @@ -from .meteorology import * +from .CBPF import CBPF +from .hysplit import hysplit +from .wind_rose import wind_rose diff --git a/AeroViz/plot/hysplit/hysplit.py b/AeroViz/plot/meteorology/hysplit.py similarity index 100% rename from AeroViz/plot/hysplit/hysplit.py rename to AeroViz/plot/meteorology/hysplit.py diff --git a/AeroViz/plot/meteorology/wind_rose.py b/AeroViz/plot/meteorology/wind_rose.py new file mode 100644 index 0000000..6ffc58b --- /dev/null +++ b/AeroViz/plot/meteorology/wind_rose.py @@ -0,0 +1,77 @@ +from typing import Literal + +import matplotlib.pyplot as plt +import numpy as np +import windrose +from matplotlib.pyplot import Figure, Axes +from pandas import DataFrame, Series + +from AeroViz.plot.utils import * + +__all__ = ['wind_rose'] + + +@set_figure(figsize=(4.3, 4)) +def wind_rose(df: DataFrame, + WS: Series | str, + WD: Series | str, + val: Series | str | None = None, + typ: Literal['bar', 'scatter'] = 'scatter', + rlabel_pos: float = 30, + **kwargs + ) -> tuple[Figure, Axes]: + # conditional bivariate probability function (cbpf) python + # https://davidcarslaw.github.io/openair/reference/polarPlot.html + # https://github.com/davidcarslaw/openair/blob/master/R/polarPlot.R + windrose.WindroseAxes._info = 'WindroseAxes' + + df = df.dropna(subset=[WS, WD] + ([val] if val is not None else [])) + + radius = df[WS].to_numpy() + theta = df[WD].to_numpy() + radian = np.radians(theta) + values = df[val].to_numpy() if val is not None else None + + # In this case, the windrose is a simple frequency diagram, + # the function automatically calculates the radians of the given wind direction. + if typ == 'bar': + fig, ax = plt.subplots(figsize=(5.5, 4), subplot_kw={'projection': 'windrose'}) + fig.subplots_adjust(left=0) + + ax.bar(theta, radius, bins=[0, 1, 2, 3], normed=True, colors=['#0F1035', '#365486', '#7FC7D9', '#DCF2F1']) + ax.set( + ylim=(0, 30), + yticks=[0, 15, 30], + yticklabels=['', '15 %', '30 %'], + rlabel_position=rlabel_pos + ) + ax.set_thetagrids(angles=[0, 45, 90, 135, 180, 225, 270, 315], + labels=["E", "NE", "N", "NW", "W", "SW", "S", "SE"]) + + ax.legend(units='m/s', bbox_to_anchor=[1.1, 0.5], loc='center left', ncol=1) + + # In this case, the windrose is a scatter plot; + # in contrast, this function does not calculate the radians, so the user has to input them. + else: + fig, ax = plt.subplots(figsize=(5, 4), subplot_kw={'projection': 'windrose'}) + fig.subplots_adjust(left=0) + + scatter = ax.scatter(radian, radius, s=15, c=values, vmax=np.quantile(values, 0.90) if values is not None else None, + edgecolors='none', cmap='jet', alpha=0.8) + ax.set( + ylim=(0, 7), + yticks=[1, 3, 5, 7], + yticklabels=['1 m/s', '3 m/s', '5 m/s', '7 m/s'], + rlabel_position=rlabel_pos, + theta_direction=-1, + theta_zero_location='N', + title=kwargs.get('title', None) + ) + ax.set_thetagrids(angles=[0, 45, 90, 135, 180, 225, 270, 315], + labels=["N", "NE", "E", "SE", "S", "SW", "W", "NW"]) + + plt.colorbar(scatter, ax=ax, label=Unit(val), pad=0.1, fraction=0.04) + + plt.show() + + return fig, ax diff --git a/AeroViz/plot/optical/optical.py b/AeroViz/plot/optical/optical.py index 65d519e..7b606be 100644 --- a/AeroViz/plot/optical/optical.py +++ b/AeroViz/plot/optical/optical.py @@ -3,11 +3,10 @@ import matplotlib.pyplot as plt import numpy as np -# from PyMieScatt import ScatteringFunction from matplotlib.pyplot import Figure, Axes -from AeroViz.plot.optical.PyMieScatt_update import ScatteringFunction -from AeroViz.plot.optical.mie_theory import Mie_Q, Mie_MEE, Mie_PESD +from AeroViz.dataProcess.Optical.PyMieScatt_update import ScatteringFunction +from AeroViz.dataProcess.Optical.mie_theory import Mie_Q, Mie_MEE, Mie_PESD from AeroViz.plot.utils import * __all__ = ['Q_plot', diff --git a/AeroViz/plot/utils/plt_utils.py b/AeroViz/plot/utils/plt_utils.py index b8119d9..ae79ebb 100644 --- a/AeroViz/plot/utils/plt_utils.py +++ b/AeroViz/plot/utils/plt_utils.py @@ -50,7 +50,7 @@ def wrapper(*args, **kwargs): plt.rcParams['legend.labelspacing'] = 0.7 plt.rcParams['figure.figsize'] = figsize or (4, 4) - plt.rcParams['figure.dpi'] = 200 + plt.rcParams['figure.dpi'] = 300 plt.rcParams['figure.autolayout'] = autolayout if not autolayout: diff --git a/AeroViz/rawDataReader/__init__.py b/AeroViz/rawDataReader/__init__.py index f22a54e..a2550d8 100644 --- a/AeroViz/rawDataReader/__init__.py +++ b/AeroViz/rawDataReader/__init__.py @@ -1,5 +1,6 @@ from datetime import datetime from pathlib import Path +from typing import Any from pandas import Grouper, Timedelta @@ -25,7 +26,7 @@ def RawDataReader(instrument_name: str, end: datetime = None, mean_freq: str = '1h', csv_out: bool = True, - ): + **kwargs: Any): """ Factory function to instantiate the appropriate reader module for a given instrument and return the processed data over the specified time range. 
@@ -107,7 +108,8 @@ def RawDataReader(instrument_name: str, qc=qc, qc_freq=qc_freq, rate=rate, - append_data=append_data + append_data=append_data, + **kwargs ) return reader_module( start=start, diff --git a/AeroViz/rawDataReader/core/__init__.py b/AeroViz/rawDataReader/core/__init__.py index 000ccfd..c2a7baf 100644 --- a/AeroViz/rawDataReader/core/__init__.py +++ b/AeroViz/rawDataReader/core/__init__.py @@ -1,9 +1,9 @@ import json -import logging from abc import ABC, abstractmethod +from contextlib import contextmanager from datetime import datetime from pathlib import Path -from typing import Optional +from typing import Optional, Generator import numpy as np import pandas as pd @@ -12,6 +12,7 @@ from rich.progress import Progress, TextColumn, BarColumn, TimeRemainingColumn, TaskProgressColumn from AeroViz.rawDataReader.config.supported_instruments import meta +from AeroViz.rawDataReader.core.logger import ReaderLogger from AeroViz.rawDataReader.core.qc import DataQualityControl __all__ = ['AbstractReader'] @@ -35,11 +36,12 @@ def __init__(self, qc: bool = True, qc_freq: Optional[str] = None, rate: bool = True, - append_data: bool = False): + append_data: bool = False, + **kwargs): self.path = Path(path) self.meta = meta[self.nam] - self.logger = self._setup_logger() + self.logger = ReaderLogger(self.nam, self.path) self.reset = reset self.qc = qc @@ -53,6 +55,8 @@ def __init__(self, self.csv_nam_raw = self.path / f'_read_{self.nam.lower()}_raw.csv' self.csv_out = self.path / f'output_{self.nam.lower()}.csv' + self.size_range = kwargs.get('size_range', (11.8, 593.5)) + def __call__(self, start: datetime, end: datetime, @@ -78,20 +82,6 @@ def _raw_reader(self, file): def _QC(self, df: DataFrame) -> DataFrame: return df - def _setup_logger(self) -> logging.Logger: - logger = logging.getLogger(self.nam) - logger.setLevel(logging.INFO) - - for handler in logger.handlers[:]: - handler.close() - logger.removeHandler(handler) - - handler = logging.FileHandler(self.path / f'{self.nam}.log') - handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')) - logger.addHandler(handler) - - return logger - def _rate_calculate(self, raw_data, qc_data) -> None: def __base_rate(raw_data, qc_data): period_size = len(raw_data.resample('1h').mean().index) @@ -104,28 +94,27 @@ def __base_rate(raw_data, qc_data): # validate rate calculation if period_size == 0 or sample_size == 0 or qc_size == 0: - print(f'\t\t\033[91m No data for this period... skipping\033[0m') + self.logger.warning(f'\t\t No data for this period... skip') continue - - if period_size < sample_size or sample_size < qc_size: - print( - f'\t\tInvalid size relationship: period={period_size}, sample={sample_size}, QC={qc_size}... skipping') + if period_size < sample_size: + self.logger.warning(f'\t\tError: Sample({sample_size}) > Period({period_size})... skip') + continue + if sample_size < qc_size: + self.logger.warning(f'\t\tError: QC({qc_size}) > Sample({sample_size})... 
skip') + continue else: - _acq_rate = round((sample_size / period_size) * 100, 1) - _yid_rate = round((qc_size / sample_size) * 100, 1) - _OEE_rate = round((qc_size / period_size) * 100, 1) - - self.logger.info(f'{_nam}:') - self.logger.info(f"\tAcquisition rate: {_acq_rate}%") - self.logger.info(f'\tYield rate: {_yid_rate}%') - self.logger.info(f'\tOEE rate: {_OEE_rate}%') - self.logger.info(f"{'=' * 60}") - - print(f'\n\t{_nam} : ') - print(f'\t\tacquisition rate | yield rate -> OEE rate : ' - f'\033[91m{_acq_rate}% | {_yid_rate}% -> {_OEE_rate}%\033[0m') + _sample_rate = round((sample_size / period_size) * 100, 1) + _valid_rate = round((qc_size / sample_size) * 100, 1) + _total_rate = round((qc_size / period_size) * 100, 1) + + self.logger.info(f"\t\t{self.logger.CYAN}▶ {_nam}{self.logger.RESET}") + self.logger.info( + f"\t\t\t├─ {'Sample Rate':15}: {self.logger.BLUE}{_sample_rate:>6.1f}%{self.logger.RESET}") + self.logger.info( + f"\t\t\t├─ {'Valid Rate':15}: {self.logger.BLUE}{_valid_rate:>6.1f}%{self.logger.RESET}") + self.logger.info( + f"\t\t\t└─ {'Total Rate':15}: {self.logger.BLUE}{_total_rate:>6.1f}%{self.logger.RESET}") if self.meta['deter_key'] is not None: # use qc_freq to calculate each period rate @@ -135,9 +124,8 @@ def __base_rate(raw_data, qc_data): for (month, _sub_raw_data), (_, _sub_qc_data) in zip(raw_data_grouped, qc_data_grouped): self.logger.info( - f"\tProcessing: {_sub_raw_data.index[0].strftime('%F')} to {_sub_raw_data.index[-1].strftime('%F')}") - print( - f"\n\tProcessing: {_sub_raw_data.index[0].strftime('%F')} to {_sub_raw_data.index[-1].strftime('%F')}") + f"\t{self.logger.BLUE}▶ Processing: {_sub_raw_data.index[0].strftime('%F')}" + f" to {_sub_raw_data.index[-1].strftime('%F')}{self.logger.RESET}") __base_rate(_sub_raw_data, _sub_qc_data) @@ -201,6 +189,34 @@ def _save_data(self, raw_data: DataFrame, qc_data: DataFrame) -> None: except Exception as e: raise IOError(f"Error saving data. {e}") + @contextmanager + def progress_reading(self, files: list) -> Generator: + # temporarily buffer log messages by replacing the logger methods + logs = {level: [] for level in ['info', 'warning', 'error']} + original = {level: getattr(self.logger, level) for level in logs} + + for level, msgs in logs.items(): + setattr(self.logger, level, msgs.append) + + try: + with Progress( + TextColumn("[bold blue]{task.description}", style="bold blue"), + BarColumn(bar_width=25, complete_style="green", finished_style="bright_green"), + TaskProgressColumn(), + TimeRemainingColumn(), + TextColumn("{task.fields[filename]}", style="yellow"), + console=Console(force_terminal=True, color_system="auto", width=120), + expand=False + ) as progress: + task = progress.add_task(f"▶ Reading {self.nam} files", total=len(files), filename="") + yield progress, task + finally: + # restore the logger methods and emit the buffered messages + for level, msgs in logs.items(): + setattr(self.logger, level, original[level]) + for msg in msgs: + original[level](msg) + def _read_raw_files(self) -> tuple[DataFrame | None, DataFrame | None]: files = [f for file_pattern in self.meta['pattern'] @@ -212,37 +228,28 @@ def _read_raw_files(self) -> tuple[DataFrame | None, DataFrame | None]: raise FileNotFoundError(f"No files in '{self.path}' could be read. 
Please check the current path.") df_list = [] - with Progress( - TextColumn("[bold blue]{task.description}", style="bold blue"), - BarColumn(bar_width=18, complete_style="green", finished_style="bright_green"), - TaskProgressColumn(), - TimeRemainingColumn(), - TextColumn("{task.fields[filename]}", style="yellow"), - console=Console(force_terminal=True, color_system="auto"), - expand=False - ) as progress: - task = progress.add_task(f"Reading {self.nam} files", total=len(files), filename="") + + # Context manager for progress bar display + with self.progress_reading(files) as (progress, task): for file in files: progress.update(task, advance=1, filename=file.name) try: - df = self._raw_reader(file) - - if df is not None and not df.empty: + if (df := self._raw_reader(file)) is not None and not df.empty: df_list.append(df) else: - self.logger.warning(f"File {file.name} produced an empty DataFrame or None.") - - except pd.errors.ParserError as e: - self.logger.error(f"Error tokenizing data: {e}") + self.logger.warning(f"\tFile {file.name} produced an empty DataFrame or None.") except Exception as e: self.logger.error(f"Error reading {file.name}: {e}") if not df_list: - raise ValueError("All files were either empty or failed to read.") + raise ValueError(f"\033[41m\033[97mAll files were either empty or failed to read.\033[0m") raw_data = concat(df_list, axis=0).groupby(level=0).first() + if self.nam == 'SMPS': + raw_data = raw_data.sort_index(axis=1, key=lambda x: x.astype(float)) + raw_data = self._timeIndex_process(raw_data).apply(to_numeric, errors='coerce').copy(deep=True) qc_data = self._QC(raw_data).apply(to_numeric, errors='coerce').copy(deep=True) @@ -251,29 +258,28 @@ def _read_raw_files(self) -> tuple[DataFrame | None, DataFrame | None]: def _run(self, user_start, user_end): # read pickle if pickle file exists and 'reset=False' or process raw data or append new data if self.pkl_nam_raw.exists() and self.pkl_nam.exists() and not self.reset: - print(f"\n{datetime.now().strftime('%m/%d %X')} : Reading {self.nam} \033[96mPICKLE\033[0m " - f"from {user_start} to {user_end}\n") + self.logger.info_box(f"Reading {self.nam} PICKLE from {user_start} to {user_end}", color_part="PICKLE") _f_raw_done, _f_qc_done = read_pickle(self.pkl_nam_raw), read_pickle(self.pkl_nam) if self.append: - print(f"Appending new data from {user_start} to {user_end}") + self.logger.info_box(f"Appending New data from {user_start} to {user_end}", color_part="New data") + _f_raw_new, _f_qc_new = self._read_raw_files() _f_raw = self._timeIndex_process(_f_raw_done, append_df=_f_raw_new) _f_qc = self._timeIndex_process(_f_qc_done, append_df=_f_qc_new) + else: _f_raw, _f_qc = _f_raw_done, _f_qc_done + return _f_qc if self.qc else _f_raw else: - print(f"\n{datetime.now().strftime('%m/%d %X')} : Reading {self.nam} \033[96mRAW DATA\033[0m " - f"from {user_start} to {user_end}\n") + self.logger.info_box(f"Reading {self.nam} RAW DATA from {user_start} to {user_end}", color_part="RAW DATA") _f_raw, _f_qc = self._read_raw_files() # process time index - data_start, data_end = _f_raw.index.sort_values()[[0, -1]] - _f_raw = self._timeIndex_process(_f_raw, user_start, user_end) _f_qc = self._timeIndex_process(_f_qc, user_start, user_end) _f_qc = self._outlier_process(_f_qc) @@ -281,15 +287,8 @@ def _run(self, user_start, user_end): # save self._save_data(_f_raw, _f_qc) - self.logger.info(f"{'=' * 60}") - self.logger.info(f"Raw data time : {data_start} to {data_end}") - self.logger.info(f"Output time : {user_start} to {user_end}") - 
self.logger.info(f"{'-' * 60}") - if self.rate: - _f_raw = _f_raw.apply(to_numeric, errors='coerce') - _f_qc = _f_qc.apply(to_numeric, errors='coerce') - self._rate_calculate(_f_raw, _f_qc) + self._rate_calculate(_f_raw.apply(to_numeric, errors='coerce'), _f_qc.apply(to_numeric, errors='coerce')) return _f_qc if self.qc else _f_raw diff --git a/AeroViz/rawDataReader/core/logger.py b/AeroViz/rawDataReader/core/logger.py new file mode 100644 index 0000000..acb0706 --- /dev/null +++ b/AeroViz/rawDataReader/core/logger.py @@ -0,0 +1,78 @@ +import logging +import re +import sys +from pathlib import Path + + +class ReaderLogger: + def __init__(self, name: str, log_path: Path): + self.name = name + self.log_path = log_path + + # ANSI color codes + self.CYAN = '\033[96m' + self.BLUE = '\033[94m' + self.GREEN = '\033[92m' + self.YELLOW = '\033[93m' + self.RED = '\033[91m' + self.RESET = '\033[0m' + + self.logger = self._setup_logger() + + def _setup_logger(self) -> logging.Logger: + logger = logging.getLogger(self.name) + logger.setLevel(logging.INFO) + + # Remove existing handlers + for handler in logger.handlers[:]: + handler.close() + logger.removeHandler(handler) + + # clean ANSI formatter (for log file) + class CleanFormatter(logging.Formatter): + def format(self, record): + formatted_msg = super().format(record) + return re.sub(r'\033\[[0-9;]*m', '', formatted_msg) + + # Set up handlers + file_handler = logging.FileHandler(self.log_path / f'{self.name}.log') + file_handler.setFormatter(CleanFormatter('%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')) + + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setFormatter(logging.Formatter('%(message)s')) + + logger.addHandler(file_handler) + logger.addHandler(console_handler) + + return logger + + def info(self, msg: str): + self.logger.info(msg) + + def warning(self, msg: str): + self.logger.warning(msg) + + def error(self, msg: str): + self.logger.error(msg) + + def info_box(self, text: str, color_part: str = None, width: int = 80): + """ + Create a boxed message with optional colored text + + Args: + text: Base text format (e.g., "Reading {} RAW DATA from {} to {}") + color_part: Part of text to be colored (e.g., "RAW DATA") + width: Box width + """ + display_text = text.replace(color_part, " " * len(color_part)) if color_part else text + + left_padding = " " * ((width - len(display_text)) // 2) + right_padding = " " * (width - len(display_text) - len(left_padding)) + + content = text.replace(color_part, f"{self.CYAN}{color_part}{self.RESET}") if color_part else text + + __content__ = f"{left_padding}{content}{right_padding}" + + self.info(f"╔{'═' * width}╗") + self.info(f"║{__content__}║") + self.info(f"╚{'═' * width}╝") diff --git a/AeroViz/rawDataReader/script/EPA.py b/AeroViz/rawDataReader/script/EPA.py index 87d63f4..238e757 100644 --- a/AeroViz/rawDataReader/script/EPA.py +++ b/AeroViz/rawDataReader/script/EPA.py @@ -18,7 +18,7 @@ def _raw_reader(self, file): on_bad_lines='skip') if len(df.groupby('測站')) > 1: - raise ValueError(f'Multiple stations found in the file: {df['測站'].unique()}') + raise ValueError(f"Multiple stations found in the file: {df['測站'].unique()}") else: if '測站' in df.columns: df.drop(columns=['測站'], inplace=True) diff --git a/AeroViz/rawDataReader/script/Minion.py b/AeroViz/rawDataReader/script/Minion.py index 4190686..be5f0b9 100644 --- a/AeroViz/rawDataReader/script/Minion.py +++ b/AeroViz/rawDataReader/script/Minion.py @@ -149,7 +149,7 @@ def XRF_QAQC(self, columns_to_convert = [col for 
col in MDL.keys() if col in df.columns] df[columns_to_convert] = df[columns_to_convert].div(1000) - self.logger.info(f"XRF QAQC summary: transform values below MDL to {MDL_replace}") + self.logger.info(f"\t{'XRF QAQC summary':21}: transform values below MDL to {MDL_replace}") return df @@ -206,9 +206,10 @@ def IGAC_QAQC(self, # calculate the percentage of retained data retained_percentage = (valid_mask.sum() / len(df)) * 100 - self.logger.info(f"Ions balance summary: {retained_percentage.__round__(0)}% within tolerance ± {tolerance}") + self.logger.info( + f"\t{'Ions balance summary':21}: {round(retained_percentage, 0)}% within tolerance ± {tolerance}") if retained_percentage < 70: - self.logger.warning("Warning: The percentage of retained data is less than 70%") + self.logger.warning("\tWarning: The percentage of retained data is less than 70%") return df diff --git a/AeroViz/rawDataReader/script/SMPS.py b/AeroViz/rawDataReader/script/SMPS.py index 00de43d..5620af7 100644 --- a/AeroViz/rawDataReader/script/SMPS.py +++ b/AeroViz/rawDataReader/script/SMPS.py @@ -1,7 +1,7 @@ import csv import numpy as np -from pandas import to_datetime, to_numeric, read_csv, isna +from pandas import to_datetime, to_numeric, read_csv from AeroViz.rawDataReader.core import AbstractReader @@ -40,7 +40,7 @@ def _raw_reader(self, file): for date_format in date_formats: _time_index = parse_date(_df, date_format) - if not isna(_time_index).all(): + if not _time_index.isna().all(): break else: raise ValueError("Unable to parse dates with given formats") @@ -56,14 +56,17 @@ def _raw_reader(self, file): _df_smps.columns = _df_smps.columns.astype(float) _df_smps = _df_smps.loc[_df_smps.index.dropna().copy()] - if _df_smps.columns[0] != 11.8: - print(f'file_name: {file.name}') - return None + if _df_smps.columns[0] != self.size_range[0] or _df_smps.columns[-1] != self.size_range[1]: + self.logger.info(f'\tSMPS file: {file.name} does not match the default size range {self.size_range}, ' + f'it is ({_df_smps.columns[0]}, {_df_smps.columns[-1]})') return _df_smps.apply(to_numeric, errors='coerce') # QC data def _QC(self, _df): + size_range_mask = (_df.columns.astype(float) >= self.size_range[0]) & ( + _df.columns.astype(float) <= self.size_range[1]) + _df = _df.loc[:, size_range_mask] # mask out the data size lower than 7 _df['total'] = _df.sum(axis=1, min_count=1) * (np.diff(np.log(_df.keys().to_numpy(float)))).mean() @@ -74,8 +77,7 @@ def _QC(self, _df): _df = _df.mask(_df['total'] < 2000) # remove the bin over 400 nm which num. conc. larger than 4000 - _df_remv_ky = _df.keys()[:-2][_df.keys()[:-2] >= 400.] - + _df_remv_ky = _df.keys()[:-1][_df.keys()[:-1] >= 400.] _df[_df_remv_ky] = _df[_df_remv_ky].copy().mask(_df[_df_remv_ky] > 4000.) return _df[_df.keys()[:-1]] diff --git a/AeroViz/tools/__init__.py b/AeroViz/tools/__init__.py index f917fe6..3b64717 100644 --- a/AeroViz/tools/__init__.py +++ b/AeroViz/tools/__init__.py @@ -1,3 +1,2 @@ from .database import DataBase from .dataclassifier import DataClassifier -from .datareader import DataReader diff --git a/AeroViz/tools/datareader.py b/AeroViz/tools/datareader.py deleted file mode 100644 index 7ead768..0000000 --- a/AeroViz/tools/datareader.py +++ /dev/null @@ -1,66 +0,0 @@ -from abc import ABC, abstractmethod -from pathlib import Path - -from pandas import read_csv, read_json, read_excel, DataFrame - - -class FileHandler(ABC): - """ An abstract base class for reading data files with different extensions (.csv, .json, .xls, .xlsx). 
""" - - @abstractmethod - def read_data(self, file_path: Path) -> DataFrame: - pass - - -class CsvFileHandler(FileHandler): - def read_data(self, file_path: Path) -> DataFrame: - return read_csv(file_path, na_values=('E', 'F', '-', '_', '#', '*'), index_col=0, parse_dates=True, - low_memory=False) - - -class JsonFileHandler(FileHandler): - def read_data(self, file_path: Path) -> DataFrame: - return read_json(file_path) - - -class ExcelFileHandler(FileHandler): - def read_data(self, file_path: Path) -> DataFrame: - return read_excel(file_path, index_col=0, parse_dates=True, ) - - -class DataReaderFactory: - _handler_mapping = { - '.csv': CsvFileHandler(), - '.json': JsonFileHandler(), - '.xls': ExcelFileHandler(), - '.xlsx': ExcelFileHandler(), - } - - @staticmethod - def create_handler(file_extension: str) -> FileHandler: - reader_class = DataReaderFactory._handler_mapping.get(file_extension) - if reader_class is None: - raise ValueError(f"Unsupported file format: {file_extension}") - return reader_class - - -class DataReader: - """ - A class for reading data files with different extensions (.csv, .json, .xls, .xlsx). - - Parameters - ---------- - filename (Path | str): The name of the file to be read or the Path of the file. - - Returns - ------- - pandas.DataFrame: data - - Examples - -------- - >>> psd = DataReader(Path(...)) - """ - - def __new__(cls, file_path: Path | str) -> DataFrame: - file_path = Path(file_path) - return DataReaderFactory.create_handler(file_path.suffix.lower()).read_data(file_path) diff --git a/README.md b/README.md index 729a4d5..2486ae5 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ ##
+