Joel-Joseph-George committed Feb 13, 2024
1 parent 19970c1 commit a0687dc
Showing 4 changed files with 64 additions and 79 deletions.
File renamed without changes.
11 changes: 5 additions & 6 deletions Dataupload/readme.MD
@@ -1,16 +1,16 @@
# DataUpload
# Data Upload

This Python script streamlines data uploads to a database. Instead of uploading individual files by hand, it automates the entire process in a single, user-friendly script: one command is all it takes to start the data flow.

The script first extracts data from a specified CSV file, then transforms that raw data into the format the database expects, and finally transmits the prepared data to a designated endpoint, completing the whole upload in one automated pass.
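
As a rough illustration, the end-to-end flow resembles the minimal sketch below. The base URL, endpoint, and function names here are hypothetical placeholders; the scripts in this directory are the actual reference.

```python
import csv
import requests

API_BASE = "http://localhost:8000"  # hypothetical base URL; point this at your deployment

def read_rows(csv_file_path):
    """Extract rows from a CSV file as dictionaries keyed by the header row."""
    with open(csv_file_path, "r", encoding="utf-8") as file:
        yield from csv.DictReader(file)

def upload_rows(rows, resource_url):
    """Transmit each row to the API as JSON, stopping on the first failed request."""
    for row in rows:
        response = requests.post(f"{API_BASE}{resource_url}", json=row, timeout=30)
        response.raise_for_status()

# Example with hypothetical paths:
# upload_rows(read_rows("files/example.csv"), "/resources/example")
```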

### Contents:

Within this directory, you will discover two Python scripts: `fetching.py` and `upload.py.`
Within this directory, you will discover two Python scripts: `extract.py` and `upload.py`.

#### Automated Data Extraction:

Handling large SQL dump files, particularly those of 2GB size, can be a daunting and time-consuming task when manually extracting tables. To streamline this process, the `fetching.py` script has been developed. This script systematically traverses your SQL dump, capturing data and organizing it into CSV files. Each CSV file is named after its corresponding table, ensuring convenient access for future use.
Manually extracting tables from large SQL dump files, particularly those around 2 GB, can be daunting and time-consuming. The `extract.py` script streamlines this process: it traverses your SQL dump, captures the data, and organizes it into CSV files, each named after its corresponding table for convenient access later.
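
A minimal sketch of that idea is shown below, assuming one single-row `INSERT` statement per line; it is illustrative only, and `extract.py` remains the authoritative implementation.

```python
import csv
import re

def dump_to_csvs(dump_path):
    """Scan a SQL dump line by line and append each INSERT's values to a CSV named after its table."""
    insert_pattern = re.compile(r"INSERT INTO [`\"]?(\w+)[`\"]?.*?VALUES\s*\((.*)\);", re.IGNORECASE)
    with open(dump_path, "r", encoding="utf-8", errors="replace") as dump:
        for line in dump:
            match = insert_pattern.search(line)
            if not match:
                continue
            table, values = match.groups()
            # Naive split: real dumps need quoting, escaping, and multi-row INSERTs handled properly.
            row = [value.strip(" '\"") for value in values.split(",")]
            with open(f"{table}.csv", "a", newline="", encoding="utf-8") as out:
                csv.writer(out).writerow(row)
```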


#### Data Upload Utility:
@@ -22,7 +22,6 @@ Complementing the data extraction process, `upload.py` facilitates seamless data
The upload script writes data to a specified schema within the database. If that schema does not exist, the script creates it automatically, keeping the data organized and accessible.
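
A hedged sketch of that behaviour, assuming a PostgreSQL database reached through SQLAlchemy (which `requirements.txt` installs), could look like this; the connection URL below is a placeholder, and `upload.py` contains the real logic.

```python
from sqlalchemy import create_engine, text

def ensure_schema(db_url, schema_name):
    """Create the target schema if it does not already exist, then dispose of the engine."""
    engine = create_engine(db_url)
    try:
        with engine.begin() as connection:
            connection.execute(text(f'CREATE SCHEMA IF NOT EXISTS "{schema_name}"'))
    finally:
        engine.dispose()

# Placeholder URL; in upload.py the schema name comes from the VACHAN_POSTGRES_SCHEMA environment variable.
# ensure_schema("postgresql+psycopg2://user:password@localhost:5432/vachan", "vachan_cms_rest_12")
```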




## Implementation Details

@@ -40,8 +39,8 @@ The Python script responsible for data upload incorporates functionality to uplo

### Execution Command:

To run the fetching script you can use this command
```python3 fetching.py```
To run the extract script, use the following command:
```python3 extract.py```

### Execution Command:

11 changes: 2 additions & 9 deletions Dataupload/requirements.txt
@@ -1,10 +1,3 @@
certifi==2021.10.8
charset-normalizer==2.0.12
idna==3.3
psycopg2
requests==2.27.1
urllib3==1.26.8
fastapi[all]==0.95.0
SQLAlchemy==2.0.9
jsonpickle==2.2.0
pytz==2023.3
requests==2.27.1
psycopg2
121 changes: 57 additions & 64 deletions Dataupload/upload.py
@@ -22,7 +22,7 @@ def create_database_schema():
postgres_schema = os.environ.get("VACHAN_POSTGRES_SCHEMA", "vachan_cms_rest_12")

#If you want to use a new schema, you can use the line below to specify its name.
# postgres_schema = os.environ.get("VACHAN_POSTGRES_SCHEMA", "DataUpload")
# postgres_schema = os.environ.get("VACHAN_POSTGRES_SCHEMA", "<schema_name>")


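# URL-encode the password so special characters are safe inside the database connection URL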
encoded_password = urllib.parse.quote(postgres_password, safe='')
@@ -49,7 +49,6 @@ def create_database_schema():


#creating token
#Token is disabled for now since the CMS is not integrated with auth. When it is, the token needs to be enabled in the request headers
# LOGIN_URL = '/v2/user/login'
# SUPER_USER = os.environ.get("VACHAN_SUPER_USERNAME")
# SUPER_PASSWORD = os.environ.get("VACHAN_SUPER_PASSWORD")
@@ -476,59 +475,59 @@ def add_bible(csv_file_path):
return usfm_list

#==========================================================================================================================
# def add_parascriptual(csv_file_path): #Only use if you want to add new parascriptual data using folder "parascriptuals"
# data_list = []

# try:
# with open(csv_file_path, 'r', encoding='utf-8') as file:
# # Create a CSV reader
# reader = csv.DictReader(file)

# # Assuming the first row is the header
# for row in reader:
# try:
# # Extracting required fields
# reference_data = json.loads(row['reference'])
# reference = {
# "book": reference_data['book'],
# "chapter": reference_data.get('chapter', 0),
# "verseNumber": reference_data.get('verseNumber', 0),
# "bookEnd": reference_data.get('bookEnd', ''),
# "chapterEnd": reference_data.get('chapterEnd', 0),
# "verseEnd": reference_data.get('verseEnd', 0)
# }
# except KeyError:
# print(f"Error: 'reference' column does not contain required keys in row: {row}")
# continue
# except json.JSONDecodeError:
# print(f"Error: 'reference' column contains invalid JSON format in row: {row}")
# continue

# # Constructing data dictionary
# data = {
# "category": row.get('category', ''),
# "title": row.get('title', ''),
# "description": row.get('description', ''),
# "content": row.get('content', ''),
# "reference": reference,
# "link": row.get('link', ''),
# "metaData": json.loads(row.get('metadata', '{}')),
# "active": row.get('active', '') == 't'
# }

# data_list.append(data)

# except FileNotFoundError:
# print(f"Error: File '{csv_file_path}' not found.")
# except Exception as e:
# print(f"An error occurred while processing {csv_file_path}: {str(e)}")

# return data_list

# data = add_parascriptual('files4/ml_TBP_1_parascriptural.csv')
# resource_name = 'ml_TBP_1_parascriptural'
# parascript_url = f"/resources/parascripturals/{resource_name}"
# upload_data(data, parascript_url)
def add_parascriptual(csv_file_path): #Only use if you want to add new parascriptural data.
data_list = []

try:
with open(csv_file_path, 'r', encoding='utf-8') as file:
# Create a CSV reader
reader = csv.DictReader(file)

# Assuming the first row is the header
for row in reader:
try:
# Extracting required fields
reference_data = json.loads(row['reference'])
reference = {
"book": reference_data['book'],
"chapter": reference_data.get('chapter', 0),
"verseNumber": reference_data.get('verseNumber', 0),
"bookEnd": reference_data.get('bookEnd', ''),
"chapterEnd": reference_data.get('chapterEnd', 0),
"verseEnd": reference_data.get('verseEnd', 0)
}
except KeyError:
print(f"Error: 'reference' column does not contain required keys in row: {row}")
continue
except json.JSONDecodeError:
print(f"Error: 'reference' column contains invalid JSON format in row: {row}")
continue

# Constructing data dictionary
data = {
"category": row.get('category', ''),
"title": row.get('title', ''),
"description": row.get('description', ''),
"content": row.get('content', ''),
"reference": reference,
"link": row.get('link', ''),
"metaData": json.loads(row.get('metadata', '{}')),
"active": row.get('active', '') == 't'
}

data_list.append(data)

except FileNotFoundError:
print(f"Error: File '{csv_file_path}' not found.")
except Exception as e:
print(f"An error occurred while processing {csv_file_path}: {str(e)}")

return data_list

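#Upload the parascriptural data parsed from the CSV above to its resource endpoint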
data = add_parascriptual('files4/ml_TBP_1_parascriptural.csv')
resource_name = 'ml_TBP_1_parascriptural'
parascript_url = f"/resources/parascripturals/{resource_name}"
upload_data(data, parascript_url)

#==========================================================================================================================

@@ -554,7 +553,7 @@ def add_bible(csv_file_path):
# Add USFM data to biblebooks

def upload_bible_data():
folder_path = 'bible'
folder_path = 'bible' #folder path to the respective data files
for filename in os.listdir(folder_path):
if filename.endswith('.csv'):
csv_file_path = os.path.join(folder_path, filename)
@@ -571,20 +570,13 @@ def upload_bible_data():
except Exception as e:
print(f"Failed to upload data for {resource_name}: {str(e)}")

#By this method you can validate file by file
# try:
# # Upload all entries for a single CSV file
# upload_data(data, bible_url)
# print(f"Success: All data for {resource_name} uploaded successfully.")
# except Exception as e:
# print(f"Failed to upload data for {resource_name}: {str(e)}")

#Call the function to upload data
upload_bible_data()

#4th
#Add vocabularies

#file paths to the respective data files
file_paths = [
'vocabularies/en_EBD_1_vocabulary.csv',
'vocabularies/hi_IRVD_1_vocabulary.csv',
@@ -623,6 +615,7 @@ def upload_bible_data():
#8th
#Add commentaries

#file paths to the respective data files
file_paths = [
'files/commentarydata_mr_BBC_1_commentary.txt',
'files/commentarydata_en_BBC_1_commentary.txt',
