V 5 #4 · Open

wants to merge 12 commits into base: main
89 changes: 89 additions & 0 deletions .github/workflows/csv2readme.py
@@ -0,0 +1,89 @@
import os
import csv
import textwrap
import datetime


def update_readme():
    csv_dir = '.github/excel2csv'
    readme_path = 'README.md'

    if not os.path.exists(csv_dir):
        print(f"Directory {csv_dir} does not exist.")
        return

    files = []
    names = []
    emails = []
    comments = []
    line_counts = []

    for filename in os.listdir(csv_dir):
        if filename.endswith('.csv'):
            with open(os.path.join(csv_dir, filename), 'r') as csvfile:
                csvreader = csv.reader(csvfile)
                current_name = ''
                current_email = ''
                current_comment = ''
                line_count = 0
                for i, row in enumerate(csvreader):
                    if i == 0:
                        current_name = row[1].strip()  # Extract Name
                    elif i == 1:
                        current_email = row[1].strip()  # Extract Email
                    elif i == 4:
                        current_comment = row[1].strip()  # Extract Comment
                    # Count data rows, which start on line 8 of the CSV
                    if i >= 7:
                        line_count += 1
                # Append the extracted data for the current file to the lists
                files.append(filename)
                names.append(current_name)
                emails.append(current_email)
                comments.append(current_comment)
                line_counts.append(line_count)

    if not files:
        print("No CSV files found. Skipping README update.")
        return

    readme = ''

    # Write the README.md preamble (files is guaranteed non-empty here)
    readme += textwrap.dedent(f'''
        ## This Dataset's Contributions

        **Name:** {' / '.join(set(names))}
        <br>
        **Email:** {' / '.join(set(emails))}
        ''')

    for i, comment in enumerate(comments):
        readme += textwrap.dedent(f'''
            ```
            File: {files[i]}
            Datapoints: {line_counts[i]}
            Comment: {comment}
            ```
            ''')

    readme += textwrap.dedent(f'''
        **Last updated:** {datetime.datetime.now().strftime("%m-%d-%Y %I:%M%p").lower()}
        ''')

    readme += textwrap.dedent('''
        ## The ULTERA Database
        This template repository was developed for contributing to the [**ULTERA Database**](https://ultera.org) carried out under the [**DOE ARPA-E ULTIMATE program**](https://arpa-e.energy.gov/?q=arpa-e-programs/ultimate), which aims to develop a new generation of materials for turbine blades in gas turbines and related applications.

        The main scope of this dataset is collecting data on compositionally complex alloys (CCAs), also known as high entropy alloys (HEAs) and multi-principal-element alloys (MPEAs), with extra attention given to (1) high-temperature (refractory) mechanical data and (2) phases present under different processing conditions. Although low-entropy alloys (incl. binaries) are typically not presented to the end user (or counted in statistics), some are present and used in ML efforts; thus **all high-quality alloy data contributions are welcome!**

        For further information, please visit the [ULTERA-contribute](https://github.com/PhasesResearchLab/ULTERA-contribute/) repository.
        ''')

    with open(readme_path, 'w') as readme_file:
        readme_file.write(readme)

    print("README.md has been updated with the latest contributions.")


if __name__ == '__main__':
    update_readme()
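For context, `csv2readme.py` relies on the fixed layout that `excel2csv.py` writes: five metadata rows (Name, Email, Direct Fetched, Hand Fetched, Comment), a blank row, the column labels on row 7, and data from row 8 onward, which is why rows with index `i >= 7` are counted as datapoints. A minimal sketch of such a file (all values are hypothetical and the column labels are abbreviated):

```csv
Name:,Jane Doe
Email:,jane.doe@example.com
Direct Fetched:,12
Hand Fetched:,3
Comment:,Example contribution

Composition,Structure,Processing
Al0.5CoCrFeNi,FCC,AC
NbTaTiZr,BCC,AC
```

For this file, the script would record `Datapoints: 2` in the generated README entry.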
@@ -1,83 +1,86 @@
# %%
import pandas as pd
import sys
import json
import os


def convert(datasheet: str):
    '''This function converts a PyQAlloy-compliant Excel datasheet into a CSV file for the purpose of
    tracking changes in the data collection and curation, while preserving the original template/datasheet
    file along with its style and formatting. The CSV file is named after the original datasheet file, with
    the extension changed to .csv. The metadata is stored in the first few lines of the CSV file, and the
    data is stored in the rest of the file.

    Args:
        datasheet: Path to PyQAlloy-compliant Excel datasheet file.
    '''

    # Import metadata
    print('Reading the metadata.')
    metaDF = pd.read_excel(datasheet,
                           usecols="A:F",
                           nrows=4)
    meta = metaDF.to_json(orient="split")
    metaParsed = json.loads(meta, strict=False)['data']

    # Format metadata into a dictionary
    metaData = {
        'Name': metaParsed[0][1],
        'Email': metaParsed[1][1],
        'Direct Fetched': metaParsed[2][1],
        'Hand Fetched': metaParsed[3][1],
        'Comment': metaParsed[0][5]
    }

    # Derive the output file name from the datasheet name
    dataFileName = datasheet.replace('.xlsx', '').replace('.xls', '')

    # Import data
    print('Importing data.')
    df2 = pd.read_excel(datasheet,
                        usecols="A:N",
                        nrows=20000,
                        skiprows=8)
    # Convert the dataset
    parsed = df2.to_json(orient="split")
    labels = json.loads(parsed, strict=False)['columns']
    data = json.loads(parsed, strict=False)['data']

    print('Imported ' + str(len(data)) + ' datapoints.')

    with open(dataFileName + '.csv', 'w+') as outFile:
        # Write the metadata
        for line, val in metaData.items():
            outFile.write(line + ':,' + str(val) + '\n')
        outFile.write('\n')
        # Write the data
        outFile.write(','.join(labels) + '\n')
        for line in data:
            outFile.write(','.join(str(val) for val in line) + '\n')

    print('Successfully converted ' + datasheet + ' to ' + dataFileName + '.csv\n')


def detectDatasheetsAndConvert(path: str):
    '''This function detects all PyQAlloy-compliant Excel datasheets in a directory and converts them into
    CSV files. It skips the empty template files (template_v4.xlsx and template_v4_DatasetExample.xlsx).

    Args:
        path: Path to the directory containing PyQAlloy-compliant Excel datasheets.
    '''

    for file in os.listdir(path):
        if file.endswith('.xlsx'):
            if file not in ['template_v4.xlsx', 'template_v4_DatasetExample.xlsx']:
                print('Converting ' + file)
                convert(path + '/' + file)
            else:
                print('Skipping ' + file)


if __name__ == '__main__':
    detectDatasheetsAndConvert(sys.argv[1])
import pandas as pd
import fnmatch
import sys
import json
import os


def convert(datasheet: str):
    '''This function converts an ULTERA-compliant Excel datasheet into a CSV file for the purpose of
    tracking changes in the data collection and curation, while preserving the original template/datasheet
    file along with its style and formatting. The CSV file is named after the original datasheet file, with
    the extension changed to .csv. The metadata is stored in the first few lines of the CSV file, and the
    data is stored in the rest of the file.

    Args:
        datasheet: Path to ULTERA-compliant Excel datasheet file.
    '''

    # Import metadata
    print('Reading the metadata.')
    metaDF = pd.read_excel(datasheet,
                           usecols="A:F",
                           nrows=4)
    meta = metaDF.to_json(orient="split")
    metaParsed = json.loads(meta, strict=False)['data']

    # Format metadata into a dictionary
    metaData = {
        'Name': metaParsed[0][1],
        'Email': metaParsed[1][1],
        'Direct Fetched': metaParsed[2][1],
        'Hand Fetched': metaParsed[3][1],
        'Comment': metaParsed[0][5]
    }

    # Derive the output file name from the datasheet name
    dataFileName = datasheet.replace('.xlsx', '').replace('.xls', '')

    # Import data
    print('Importing data.')
    df2 = pd.read_excel(datasheet,
                        usecols="A:N",
                        nrows=20000,
                        skiprows=8)
    # Convert the dataset
    parsed = df2.to_json(orient="split")
    labels = json.loads(parsed, strict=False)['columns']
    data = json.loads(parsed, strict=False)['data']

    print('Imported ' + str(len(data)) + ' datapoints.')

    # Ensure the output directory exists
    output_dir = '.github/excel2csv'
    os.makedirs(output_dir, exist_ok=True)

    with open(f'{output_dir}/{dataFileName}.csv', 'w+') as outFile:
        # Write the metadata
        for line, val in metaData.items():
            outFile.write(line + ':,' + str(val) + '\n')
        outFile.write('\n')
        # Write the data
        outFile.write(','.join(labels) + '\n')
        for line in data:
            outFile.write(','.join(str(val) for val in line) + '\n')

    print(f'Successfully converted {datasheet} to {output_dir}/{dataFileName}.csv\n')


def detectDatasheetsAndConvert(path: str):
    '''This function detects all ULTERA-compliant Excel datasheets in a directory and converts them into
    CSV files. It skips template files (any file matching template*.xlsx).

    Args:
        path: Path to the directory containing ULTERA-compliant Excel datasheets.
    '''

    for file in os.listdir(path):
        if file.endswith('.xlsx'):
            if not fnmatch.fnmatch(file, 'template*.xlsx'):
                print('Converting ' + file)
                convert(path + '/' + file)
            else:
                print('Skipping ' + file)


if __name__ == '__main__':
    detectDatasheetsAndConvert(sys.argv[1])
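The postcommit workflow below invokes this script over the repository root; for a quick local check, a single datasheet can also be converted directly from a Python session. A minimal sketch, assuming `pandas` and `openpyxl` are installed, the script's directory is importable, and `MyDatasheet.xlsx` is a placeholder name:

```python
# Hypothetical local run; mirrors the workflow's "python excel2csv.py ." step
# but converts one file directly instead of scanning a directory.
from excel2csv import convert

# Writes .github/excel2csv/MyDatasheet.csv, creating the directory if needed.
convert('MyDatasheet.xlsx')
```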
19 changes: 19 additions & 0 deletions .github/workflows/newfork.yml
@@ -0,0 +1,19 @@
name: Fork Notification
on: fork

jobs:
  create-fork-issue:
    runs-on: ubuntu-latest
    permissions:
      issues: write
    steps:
      - uses: actions/github-script@v6
        with:
          script: |
            await github.rest.issues.create({
              owner: context.repo.owner,
              repo: context.repo.repo,
              title: 'New Contribute Fork Created',
              body: `A new fork of this repository has been created by @${context.actor}\n\n@PhasesResearchLab/ultera-maintainers`,
              labels: ['new fork']
            });
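The `github-script` step above is a thin wrapper around the REST API's create-issue endpoint, so the same notification could be posted by hand. A sketch, assuming a token with issue-write access in the `GITHUB_TOKEN` environment variable; `OWNER/REPO` and `@some-user` are placeholders:

```python
# Hypothetical manual equivalent of the workflow's github-script call.
import os
import requests

resp = requests.post(
    'https://api.github.com/repos/OWNER/REPO/issues',
    headers={'Authorization': f"Bearer {os.environ['GITHUB_TOKEN']}"},
    json={
        'title': 'New Contribute Fork Created',
        'body': 'A new fork of this repository has been created by @some-user\n\n'
                '@PhasesResearchLab/ultera-maintainers',
        'labels': ['new fork'],
    },
)
resp.raise_for_status()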
@@ -1,32 +1,36 @@
name: excel2csv

on: [push]

jobs:
  run:
    name: excel2csv
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python 3.10
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'
          cache: 'pip'
          cache-dependency-path: 'pyqalloy-contribute/requirements.txt'

      - name: Install Dependencies
        run: |
          python -m pip install -r pyqalloy-contribute/requirements.txt

      - name: Run excel2csv
        run: |
          python pyqalloy-contribute/pyqalloy-contribute/excel2csv.py .

      - name: Commit changes with Add & Commit
        uses: EndBug/add-and-commit@v9
        with:
          message: '(automatic) excel2csv Action for Data Tracking'
          add: '*.csv'
name: postcommit

on: [push]

jobs:
  run:
    name: postcommit
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python 3.10
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Install Dependencies
        run: |
          python -m pip install pandas openpyxl

      - name: Run excel2csv
        run: |
          python .github/workflows/excel2csv.py .

      - name: Run csv2readme
        run: |
          python .github/workflows/csv2readme.py .

      - name: Commit changes with Add & Commit
        uses: EndBug/add-and-commit@v9
        with:
          message: '(automatic) Action for Data Tracking'
          add: |
            .github/excel2csv/*.csv
            README.md
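To reproduce the workflow's two script steps locally before pushing, the same commands can be replayed from the repository root. A sketch, assuming `pandas` and `openpyxl` are installed, just as the workflow sets up:

```python
# Hypothetical local replay of the "Run excel2csv" and "Run csv2readme"
# steps; the trailing '.' mirrors the workflow invocation.
import subprocess

subprocess.run(['python', '.github/workflows/excel2csv.py', '.'], check=True)
subprocess.run(['python', '.github/workflows/csv2readme.py', '.'], check=True)
```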