Merge pull request #2 from PhasesResearchLab/CSVcast

Add automated data casting to git-trackable CSV format
PhasesResearchLab · Jul 12, 2023 · 8920d9f · 8920d9f
2 parents ede55b8 + 09d3e96
commit 8920d9f
Show file tree

Hide file tree

Showing 5 changed files with 117 additions and 0 deletions.
diff --git a/.github/workflows/excel2csv.yml b/.github/workflows/excel2csv.yml
@@ -0,0 +1,32 @@
+# Converts the PyQAlloy Excel datasheets in the repository root into CSV files
+# and commits the results, so that data changes show up as text diffs.
+name: excel2csv
+
+# Runs on every push. The commit created by the last step is pushed with the
+# workflow's GITHUB_TOKEN; per GitHub's documentation such pushes do not start
+# a new workflow run, so the job does not re-trigger itself.
+on: [push]
+
+jobs:
+  run:
+    name: excel2csv
+    runs-on: ubuntu-latest
+    # The final step pushes a commit back to the repository, which requires
+    # write access to its contents. Without this, the push fails wherever the
+    # default GITHUB_TOKEN permission is read-only.
+    permissions:
+      contents: write
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: Set up Python 3.10
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+          # Cache pip downloads, keyed on the requirements file used below
+          cache: 'pip'
+          cache-dependency-path: 'pyqalloy-contribute/requirements.txt'
+
+      - name: Install Dependencies
+        run: |
+          python -m pip install -r pyqalloy-contribute/requirements.txt
+
+      # The script receives '.', i.e. it scans the repository root for datasheets
+      - name: Run excel2csv
+        run: |
+          python pyqalloy-contribute/pyqalloy-contribute/excel2csv.py .
+
+      # Only the generated CSV files are staged and committed
+      - name: Commit changes with Add & Commit
+        uses: EndBug/add-and-commit@v9
+        with:
+          message: '(automatic) excel2csv Action for Data Tracking'
+          add: '*.csv'
diff --git a/pyqalloy-contribute/pyqalloy-contribute/excel2csv.py b/pyqalloy-contribute/pyqalloy-contribute/excel2csv.py
@@ -0,0 +1,83 @@
+# %%
+import csv
+import json
+import os
+import sys
+
+import pandas as pd
+
+
+def convert(datasheet: str):
+    '''This function converts a PyQAlloy-compliant Excel datasheet into a CSV file for the purpose of
+    tracking changes in the data collection and curation, while preserving the original template/datasheet
+    file along with its style and formatting. The CSV file is named after the original datasheet file, with
+    the .xlsx/.xls extension changed to .csv. The metadata (Name, Email, Direct Fetched, Hand Fetched,
+    Comment) is stored in the first lines of the CSV file, followed by an empty line, the column labels,
+    and the data rows.
+
+    Values containing commas, quotes or line breaks are quoted according to the usual CSV rules, so that
+    free-text fields cannot shift columns or split rows.
+
+    Args:
+        datasheet: Path to PyQAlloy-compliant Excel datasheet file.
+    '''
+
+    # Import metadata (columns A:F, 4 rows below the header row)
+    print('Reading the metadata.')
+    metaDF = pd.read_excel(datasheet,
+                           usecols="A:F",
+                           nrows=4)
+    # The JSON round trip reduces the cell values to plain Python types
+    metaParsed = json.loads(metaDF.to_json(orient="split"), strict=False)['data']
+
+    # Format metadata into a dictionary
+    metaData = {
+        'Name': metaParsed[0][1],
+        'Email': metaParsed[1][1],
+        'Direct Fetched': metaParsed[2][1],
+        'Hand Fetched': metaParsed[3][1],
+        'Comment': metaParsed[0][5]
+    }
+
+    # Derive the output name by dropping only the trailing Excel extension;
+    # an '.xlsx'/'.xls' substring elsewhere in the path is left untouched
+    root, extension = os.path.splitext(datasheet)
+    dataFileName = root if extension.lower() in ('.xlsx', '.xls') else datasheet
+    outPath = dataFileName + '.csv'
+
+    # Import data (columns A:N, starting below the 8 skipped rows)
+    print('Importing data.')
+    df2 = pd.read_excel(datasheet,
+                        usecols="A:N",
+                        nrows=20000,
+                        skiprows=8)
+    # Convert the dataset; parse the JSON once and reuse it for labels and rows
+    parsed = json.loads(df2.to_json(orient="split"), strict=False)
+    labels = parsed['columns']
+    data = parsed['data']
+
+    print('Imported ' + str(len(data)) + ' datapoints.')
+
+    # newline='' plus an explicit '\n' terminator and UTF-8 encoding keep the
+    # output byte-identical across platforms, which matters for git tracking
+    with open(outPath, 'w', encoding='utf-8', newline='') as outFile:
+        writer = csv.writer(outFile, lineterminator='\n')
+        # Write the metadata
+        for key, val in metaData.items():
+            writer.writerow([key + ':', str(val)])
+        outFile.write('\n')
+        # Write the data; the writer also accepts non-string column labels
+        writer.writerow(labels)
+        for row in data:
+            # str() keeps the same textual form of each value as before
+            writer.writerow([str(val) for val in row])
+
+    print('Successfully converted ' + datasheet + ' to ' + outPath + '\n')
+
+
+def detectDatasheetsAndConvert(path: str):
+    '''This function detects all PyQAlloy-compliant Excel datasheets (.xlsx files) directly inside a
+    directory and converts them into CSV files. It skips the empty template file (template_v4.xlsx), the
+    example dataset (template_v4_DatasetExample.xlsx), and Excel lock files (names starting with '~$'),
+    which are not readable workbooks. Files are processed in sorted name order.
+
+    Args:
+        path: Path to the directory containing PyQAlloy-compliant Excel datasheets.
+    '''
+
+    skippedFiles = ('template_v4.xlsx', 'template_v4_DatasetExample.xlsx')
+
+    # os.listdir returns entries in arbitrary order; sorting keeps the log reproducible
+    for file in sorted(os.listdir(path)):
+        if not file.endswith('.xlsx'):
+            continue
+        if file in skippedFiles or file.startswith('~$'):
+            print('Skipping ' + file)
+            continue
+        print('Converting ' + file)
+        convert(os.path.join(path, file))
+
+
+if __name__ == '__main__':
+    # The first command-line argument is the directory to scan for datasheets;
+    # exit with a usage hint instead of an IndexError traceback when it is missing
+    if len(sys.argv) < 2:
+        sys.exit('Usage: python excel2csv.py <directory>')
+    detectDatasheetsAndConvert(sys.argv[1])
diff --git a/pyqalloy-contribute/requirements.txt b/pyqalloy-contribute/requirements.txt
@@ -0,0 +1,2 @@
+pandas
+openpyxl
diff --git a/templateSampleFilled4.xlsx b/templateSampleFilled4.xlsx
diff --git a/template_v4_DatasetExample.xlsx b/template_v4_DatasetExample.xlsx