Skip to content

Commit

Permalink
Merge pull request #36 from pbashyal-nmdp/version_0.0.14
Browse files Browse the repository at this point in the history
Added Example File
  • Loading branch information
mmaiers-nmdp authored Feb 23, 2024
2 parents a94bce0 + 4a68087 commit d24780b
Show file tree
Hide file tree
Showing 8 changed files with 68 additions and 44 deletions.
54 changes: 25 additions & 29 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,22 +20,19 @@ pip install py-graph-imputation

#### Get Frequency Data and Subject Data and Configuration File

For an example, copy the folders to your working directory:
- https://github.com/nmdp-bioinformatics/py-graph-imputation/tree/master/data
- https://github.com/nmdp-bioinformatics/py-graph-imputation/tree/master/conf
For an example, get [example-conf-data.zip](https://github.com/nmdp-bioinformatics/py-graph-imputation/tree/master/example-conf-data.zip)

so it appears as:
Unzip the folder so it appears as:

```
.
├── conf
│   ├── README.md
│   └── minimal-configuration.json
├── data
│   ├── freqs
│   │   └── CAU.freqs.gz
│   └── subjects
│   └── donor.csv
conf
|-- README.md
`-- minimal-configuration.json
data
|-- freqs
| `-- CAU.freqs.gz
`-- subjects
`-- donor.csv
```

#### Modify the configuration.json to suit your need
Expand All @@ -59,9 +56,9 @@ Writing hpf File: output/hpf.csv
This will produce the files which will be used for graph generation:

```
├── output
│   ├── hpf.csv # CSV file of Haplotype, Population, Freq
│   └── pop_counts_file.txt # Size of each population
output
|-- hpf.csv # CSV file of Haplotype, Population, Freq
`-- pop_counts_file.txt # Size of each population
```

#### Generate the Graph (nodes and edges) files
Expand All @@ -81,13 +78,12 @@ Performing graph generation based on following configuration:
This will produce the following files:

```
├── output
│   ├── csv
│   │   ├── edges.csv
│   │   ├── info_node.csv
│   │   ├── nodes.csv
│   │   └── top_links.csv
output
`-- csv
|-- edges.csv
|-- info_node.csv
|-- nodes.csv
`-- top_links.csv
```

#### Produce Imputation Results for Subjects
Expand Down Expand Up @@ -133,12 +129,12 @@ This will produce files in `output` directory as:

```
├── output
   ├── don.miss # Cases that failed imputation (e.g. incorrect typing etc.)
   ├── don.pmug # Phased imputation as PMUG GL String
   ├── don.pmug.pops # Population for Phased Imputation
   ├── don.problem # List of errors
   ├── don.umug # Unphased imputation as UMUG GL String
   ├── don.umug.pops # Population for Unphased Imputation
│ ├── don.miss # Cases that failed imputation (e.g. incorrect typing etc.)
│ ├── don.pmug # Phased imputation as PMUG GL String
│ ├── don.pmug.pops # Population for Phased Imputation
│ ├── don.problem # List of errors
│ ├── don.umug # Unphased imputation as UMUG GL String
│ ├── don.umug.pops # Population for Unphased Imputation
```


Expand Down
Binary file added example-conf-data.zip
Binary file not shown.
1 change: 1 addition & 0 deletions graph_generation/generate_neo4j_multi_hpf.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
# 26M edges (3.6B)and 107M top links (200MB).
# FULL_LOCI = 'ABCQR'


##############################################################################
# functions
##############################################################################
Expand Down
2 changes: 1 addition & 1 deletion grim/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,4 @@
"""Top-level package for py-grim."""

__organization__ = "NMDP/CIBMTR Bioinformatics"
__version__ = "0.0.13"
__version__ = "0.1.0"
4 changes: 2 additions & 2 deletions grim/imputation/impute.py
Original file line number Diff line number Diff line change
Expand Up @@ -949,7 +949,7 @@ def open_phases(self, haps, N_Loc, gl_string):
optionDict = {} # set()
if len(fq) == 0:
_list = []
for (gen, name) in self.cypher.loc_map.items():
for gen, name in self.cypher.loc_map.items():
count = 0
for i in range(len(hap_list[0])):
if hap_list[0][i].split("*", 1)[0] == gen:
Expand Down Expand Up @@ -2017,7 +2017,7 @@ def impute_file(self, config, planb=None, em_mr=False, em=False): ##em
problem = open(config["imputation_out_problem_file"], "w")

with f as lines:
for (i, name_gl) in enumerate(lines):
for i, name_gl in enumerate(lines):
try:
name_gl = name_gl.rstrip() # remove trailing whitespace
if "," in name_gl:
Expand Down
47 changes: 37 additions & 10 deletions grim/imputation/networkx_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,20 @@ def build_graph(self, nodesFile, edgesFile, allEdgesFile):
if not self.nodes_plan_a or row[2] in self.nodes_plan_a:
self.Vertices.append(row[1])
vertex_id = len(self.Vertices) - 1
self.Vertices_attributes[row[1]] = (row[2], list(map(float, row[3].split(";"))), vertex_id)
self.Vertices_attributes[row[1]] = (
row[2],
list(map(float, row[3].split(";"))),
vertex_id,
)

if not self.nodes_plan_b or row[2] in self.nodes_plan_b:
self.Whole_Vertices.append(row[1])
vertex_id = len(self.Whole_Vertices) - 1
self.Whole_Vertices_attributes[row[1]] = (
row[2], list(map(float, row[3].split(";"))), vertex_id)
row[2],
list(map(float, row[3].split(";"))),
vertex_id,
)

nodesDict[row[0]] = row[1]

Expand All @@ -70,7 +77,10 @@ def build_graph(self, nodesFile, edgesFile, allEdgesFile):
node2_id = row[1]
node1 = nodesDict[node1_id]
node2 = nodesDict[node2_id]
if node1 in self.Vertices_attributes and node2 in self.Vertices_attributes:
if (
node1 in self.Vertices_attributes
and node2 in self.Vertices_attributes
):
node1_label = self.Vertices_attributes[node1][0]
if node1_label == self.full_loci:
self.Edges.append([node2_id, node1_id])
Expand Down Expand Up @@ -145,7 +155,9 @@ def build_graph(self, nodesFile, edgesFile, allEdgesFile):
del sorted_indices

# Create a list of the first appearance of a number in the 0 column in the matrix
unique_values, first_occurrences_indices = np.unique(self.Edges[:, 0], return_index=True)
unique_values, first_occurrences_indices = np.unique(
self.Edges[:, 0], return_index=True
)

j = 0
for i in range(0, self.Vertices.shape[0]):
Expand All @@ -162,7 +174,9 @@ def build_graph(self, nodesFile, edgesFile, allEdgesFile):
del unique_values, first_occurrences_indices

# Create a list of the first appearance of a number in the 0 column in the matrix
unique_values, first_occurrences_indices = np.unique(self.Whole_Edges[:, 0], return_index=True)
unique_values, first_occurrences_indices = np.unique(
self.Whole_Edges[:, 0], return_index=True
)

j = 0
for i in range(0, self.Whole_Vertices.shape[0]):
Expand All @@ -182,7 +196,9 @@ def build_graph(self, nodesFile, edgesFile, allEdgesFile):
self.Whole_Neighbors_start.append(int(len(self.Whole_Vertices)))

self.Neighbors_start = np.array(self.Neighbors_start, dtype=np.uint32)
self.Whole_Neighbors_start = np.array(self.Whole_Neighbors_start, dtype=np.uint32)
self.Whole_Neighbors_start = np.array(
self.Whole_Neighbors_start, dtype=np.uint32
)

# Take the first column out of the Edges arrays
### Do the following to massive save of memory
Expand Down Expand Up @@ -249,7 +265,13 @@ def adjs_query(self, alleleList):
allele_id = self.Vertices_attributes[allele][2]
# Find the neighbors of the allele
allele_neighbors = self.Vertices[
self.Edges[range(self.Neighbors_start[allele_id], self.Neighbors_start[allele_id + 1])]]
self.Edges[
range(
self.Neighbors_start[allele_id],
self.Neighbors_start[allele_id + 1],
)
]
]
# The frequencies of the neighbors to the dictionary
for adj in allele_neighbors:
adjDict[adj] = self.Vertices_attributes[adj][1]
Expand All @@ -271,9 +293,14 @@ def adjs_query_by_color(self, alleleList, labelA, labelB):

if connector in self.Whole_Vertices_attributes:
connector_id = self.Whole_Vertices_attributes[connector]
alleles = self.Whole_Vertices[self.Whole_Edges[range(self.Whole_Neighbors_start[connector_id],
self.Whole_Neighbors_start[
connector_id + 1])]]
alleles = self.Whole_Vertices[
self.Whole_Edges[
range(
self.Whole_Neighbors_start[connector_id],
self.Whole_Neighbors_start[connector_id + 1],
)
]
]

for adj in alleles:
adjDict[adj] = self.Whole_Vertices_attributes[adj][1]
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.0.13
current_version = 0.1.0
commit = True
tag = True

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@

setup(
name="py-graph-imputation",
version="0.0.13",
version="0.1.0",
author="Pradeep Bashyal",
author_email="[email protected]",
python_requires=">=3.8",
Expand Down

0 comments on commit d24780b

Please sign in to comment.