Skip to content

Commit

Permalink
relexd skd duh d d thmp
Browse files Browse the repository at this point in the history
Creating the masthead and usage notes for `kmerdb graph`, but there are a lot of files and stats to keep track of.

I'm introducing the plan for the file "quick_stats.txt":

a summary file that provides tabular per-file k-mer counts, with associated metadata.

I'm also at a comfortable state with the edge list generation loops.

There's a lot to be done, and I don't know how much to document versus how much to trust in the intuitive nature of the interface.

Still fighting charges and working on personal issues, part time work.

Honest assessment of myself.
  • Loading branch information
MatthewRalston committed Mar 13, 2024
1 parent 768d685 commit b7edfaf
Show file tree
Hide file tree
Showing 4 changed files with 382 additions and 257 deletions.
136 changes: 82 additions & 54 deletions kmerdb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1209,20 +1209,36 @@ def make_kdbg(arguments):
logger.debug("In other words, running the kmerdb.parse.parsefile() method on each file specified via the CLI")
if arguments.parallel > 1:
with Pool(processes=arguments.parallel) as pool:
data, files_metadata = pool.map(infile.parsefile, arguments.seqfile) # Returns a list of k-mer ids
# data files_metadata
data = pool.map(infile.parsefile, arguments.seqfile) # Returns a list of k-mer ids
else:
data, files_metadata = list(map(infile.parsefile, arguments.seqfile))
# data files_metadata
data = list(map(infile.parsefile, arguments.seqfile))
#edges = list(map(lambda kmer_id: , data))
#headers = list(map(lambda d: d[1], data))

# data should be list
# data = edges


print(data)
sys.exit(1)

metadata["files"] = files_metadata

validate_graph_metadata_spec(metadata)
sys.stderr.write("\n\n\tCompleted summation and metadata aggregation across all inputs...\n\n")

sys.stderr.write("nice")
#all_observed_kmers_in_files = int( #

meaningful_kmers = int(np.sum(list(map(lambda h: h["total_nullomers"] + h['total_kmers'] - 1, headers))))

"""
aggregate counts across files for .kdbg metadata header
"""
# Aggregate k-mer metadata across all input files.

meaningful_kmers = int(np.sum(list(map(lambda h: h["unique_kmers"] + h['total_nullomers'] - 1, headers))))
all_kmers = int(np.sum(list(map(lambda h: h["total_nullomers"] + h["total_kmers"], headers))))
total_kmers = int(np.sum(list(map(lambda h: h['total_kmers'], headers))))
total_nullomers = int(np.sum(list(map(lambda h: h["total_nullomers"], headers))))
# FIXME NO, STILL AWFUL
Expand Down Expand Up @@ -1259,14 +1275,16 @@ def make_kdbg(arguments):
"files": headers
})

try:
np.dtype(metadata["n1_dtype"])
np.dtype(metadata["n2_dtype"])
np.dtype(metadata["weights_dtype"])
except TypeError as e:
logger.error(e)
logger.error("kmerdb encountered a type error and needs to exit")
raise TypeError("Incorrect dtype.")

# # um
# try:
# np.dtype(metadata["n1_dtype"])
# np.dtype(metadata["n2_dtype"])
# np.dtype(metadata["weights_dtype"])
# except TypeError as e:
# logger.error(e)
# logger.error("kmerdb encountered a type error and needs to exit")
# raise TypeError("Incorrect dtype.")


N = len(edges[0].keys()) # edges is a list of dicts, where keys are a 2-tuple (e.g. (15633431, 12202077) ) representing an edge
Expand All @@ -1282,14 +1300,17 @@ def make_kdbg(arguments):

logger.info("Generated metadata for .kdbg...")

edges = graph.create_edges(kmer_ids)

graph = graph.make_graph(arguments)
"""
At this point, unpacking should be second nature but it took about 2 minutes to get this sorted out, rebuilding, recounting k-mers, and watching dota2.
"""
# print("'edges' type:'")
# print(type(edges))
# print(edges)
# sys.stderr.write("ALMOST DONE!!")
# sys.exit(1)
print("'edges' type:'")
print(type(edges))
print(edges)
sys.stderr.write("ALMOST DONE!!")
sys.exit(1)


"""
Expand All @@ -1299,18 +1320,6 @@ def make_kdbg(arguments):
# Step 1: over each file's weighted edge list: initialize the result dict
result = {}

print
for es in edges:
for i, e in enumerate(es.keys()):
try:
result[e] = 0
except KeyError as e:
logger.debug("Could not find a valid (empty or recorded) edge relationship in the {0}'th input file's weighted edgelist")
raise e
# Step 2: over each file's weighted edge list, accumulate all counts observed in the .fa/.fq files.
for es in edges:
for i, e in enumerate(es.keys()):
result[e] += es[e]
# Step 3: pretty print a table of results.

logger.debug("Storing all edges (node pairs) and weights in previously allocated numpy arrays")
Expand All @@ -1329,35 +1338,54 @@ def make_kdbg(arguments):
#df = pd.DataFrame(twoD_weighted_edge_list)
#df.to_csv(sys.stdout, sep=arguments.output_delimiter, index=False)

kdbg_out = graph.open(arguments.kdbg, 'wb', metadata=metadata)
with graph.open(arguments.kdbg, 'wb', metadata=metadata) as kdbg_out:


try:
sys.stderr.write("\n\n\nWriting edge list to {0}...\n\n\n".format(arguments.kdbg))
for i, node1 in enumerate(n1):
try:
sys.stderr.write("\n\n\nWriting edge list to {0}...\n\n\n".format(arguments.kdbg))
for i, node1 in enumerate(n1):

node2 = n2[i]
w = weights[i]
node2 = n2[i]
w = weights[i]

if arguments.edges is True:
print("{0}\t{1}\t{2}\t{3}".format(i, node1, node2, w))
# node1, node2, weight
kdbg_out.write("{0}\t{1}\t{2}\t{3}\n".format(i, node1, node2, w))
finally:
kdbg_out._write_block(kdbg_out._buffer)
kdbg_out._handle.flush()
kdbg_out._handle.close()

sys.stderr.write("Total k-mers processed: {0}\n".format(all_observed_kmers))
sys.stderr.write("Final nullomer count: {0}\n".format(total_nullomers))
sys.stderr.write("Unique {0}-mer count: {1}\n".format(arguments.k, unique_kmers))
sys.stderr.write("Total {0}-mer count: {1}\n".format(arguments.k, total_kmers))
sys.stderr.write("="*30 + "\n")
sys.stderr.write(".kdbg stats:\n")
sys.stderr.write("-"*30 + "\n")
sys.stderr.write("Edges in file: {0}\n".format(N))
sys.stderr.write("Non-zero weights: {0}\n".format(int(np.count_nonzero(weights))))
sys.stderr.write("\nDone\n")
tupley = (i, node1, node2, w)
tupley_dl = np.array(tupley, dtype="uint64")
if arguments.edges is True:
print("{0}\t{1}\t{2}\t{3}".format(i, node1, node2, w))
# node1, node2, weight
kdbg_out.write("{0}\t{1}\t{2}\t{3}\n".format(i, node1, node2, w))
finally:
kdbg_out._write_block(kdbg_out._buffer)
kdbg_out._handle.flush()
kdbg_out._handle.close()


"""
Done around nicoles birfday
ahhhh toopley
ahhaahahaha toopley
3/12/24
"""
sys.stderr.write("Total k-mers processed: {0}\n".format(all_observed_kmers))
sys.stderr.write("Final nullomer count: {0}\n".format(total_nullomers))
sys.stderr.write("Unique {0}-mer count: {1}\n".format(arguments.k, unique_kmers))
sys.stderr.write("Total {0}-mer count: {1}\n".format(arguments.k, total_kmers))
sys.stderr.write("="*30 + "\n")
sys.stderr.write(".kdbg stats:\n")
sys.stderr.write("-"*30 + "\n")
sys.stderr.write("Edges in file: {0}\n".format(N))
sys.stderr.write("Non-zero weights: {0}\n".format(int(np.count_nonzero(weights))))
sys.stderr.write("\nDone\n")

logger.info("Done printing weighted edge list to .kdbg")

Expand Down
117 changes: 98 additions & 19 deletions kmerdb/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,11 +175,6 @@
https://matthewralston.github.io/kmerdb
===========================================
| P y P I |
===========================================
https://pypi.org/project/kmerdb/
===========================================
| G i t h u b |
===========================================
Expand All @@ -191,17 +186,56 @@
| N o t Very H u m e r u s |
============================================
https://matthewralston.github.io/blog/kmer-database-format-part-1
Please cite my repository in your work!
Feel free to e-mail me or reach out!
Please check the README.md for more details.
https://github.com/MatthewRalston/kmerdb
https://matthewralston.github.io/kmerdb/
"""








SPONGEBOB = """
⢀⣠⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣤⣄⡀
⣿⡋⠉⠉⠉⠉⠉⠉⠉⠉⠉⠉⠉⠉⠉⠉⠉⠉⠉⠉⠉⠉⢙⣿
⣿⡇⠀⠀⠀⣠⣴⠾⠿⠷⣦⡀⢀⣴⠾⠿⠷⣦⣄⠀⠀⠀⢸⣿
⢸⡇⠀⠀⢰⡟⠁⠀⣀⣀⠈⢿⡿⠁⣀⣀⠀⠈⢻⡆⠀⠀⢸⡇
⢸⣷⠀⠀⢸⡇⠀⠀⠿⠿⠁⣸⣇⠈⠿⠿⠀⠀⢸⡇⠀⠀⣾⡇
⠘⣿⠀⠀⠈⠻⣦⣄⣀⣤⣾⠛⠛⣷⣤⣀⣠⣴⠟⠁⠀⠀⣿⠃
⠀⣿⠀⠘⢷⣄⠀⠉⠉⠙⢿⠄⠠⡿⠋⠉⠉⠀⣠⡾⠃⠀⣿⠀
⠀⣿⠀⠀⠀⠙⠻⢶⣦⣤⣤⣤⣤⣤⣤⣴⡶⠟⠋⠀⠀⠀⣿⠀
⠀⣿⡄⠀⠀⠀⠀⠀⣿⣀⣹⡇⢸⣏⣀⣿⠀⠀⠀⠀⠀⢠⣿⠀
⠀⢿⡇⠀⠀⠀⠀⠀⠉⠉⠉⠁⠈⠉⠉⠉⠀⠀⠀⠀⠀⢸⡿⠀
⠀⢸⣿⣿⣿⣿⣟⠛⠛⠛⣻⡿⢿⣟⠛⠛⠛⣻⣿⣿⣿⣿⡇⠀
⠀⢸⣿⣿⣿⣿⣿⣷⣤⣾⣿⣶⣶⣿⣷⣤⣾⣿⣿⣿⣿⣿⡇⠀
⠀⠘⣿⣿⣿⣿⣿⣿⣿⣿⣿⠃⠘⣿⣿⣿⣿⣿⣿⣿⣿⣿⠃⠀
⠀⠀⠉⠉⠉⠉⠉⠉⠉⠉⠻⣧⣼⠟⠉⠉⠉⠉⠉⠉⠉⠉⠀⠀
⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠈⠁⠀⠀⠀⠀⠀⠀⠀
"""
DONE = """



DONE =

"""
"@->- -|>" ... @->-
... '"(---)"' , "-|> ?"'->
fax
- idlethoughts
==========================================
----------------D O N E-------------------
Expand Down Expand Up @@ -252,6 +286,61 @@
# MATRIX_MASTHEAD += i*"=" + "\n"


GRAPH_MASTHEAD = """
==========================================
kmerdb graph
==========================================
kmerdb graph example.fa example.kdbg --prefix temp_output
-----
.
.
.
output.kdbg
output.kdb
output.kdbi
output.kdb.gi
error.log
debug.log
output.kdbg.stats.neighbors.txt
output.kdbg.stats.edges.txt
output.kdbg.stats.adjacency.txt
---jk
output.kdbg.stats_txt
---------------------
distribution.txt
quick_stats.txt
---------------
kmers-per-file (array ok)
kmers-total :
kmers-avg :
singletons :
doublets :
triplets :
'meaningful_kmers = unique_kmers + total_kmers' (per file) : {0} = ' {1} ' ' + ' ' {2} ' |
grand_total_of_kmers :
--------------------------------------------------------------------------------------
(table structure
kmer_totals : (array okay) (per file)
unique kmers : (array okay) (per file)
nullomers : (array okay) (per file)
output.kdbg.stats.doublets
output.kdbg.stats.triplets
output.kdb.stats.average
output.k
output.kdbg.stats.paths.txt
"""



KMEANS_MASTHEAD = """
==========================================
Expand Down Expand Up @@ -325,18 +414,8 @@
I make no disclaimers.
################################
# W A R N I N G : M E M O R Y #
################################
# The following is some 'basic' guesses about expected memory usage.
# I believe this is an oversimplification of the memory profile for technical language and representation issues.
# The profile loading function reads the array into memory from the file, assuming a certain data encoding.
# The coding is now set to 'uint64' by default, which in theory allows us to only be limited by numpy array size.
# Now that we have that out of the way.
# The integer depth we have is large.
#
#
#
##################
# dimensionality reduction + kmeans
Expand Down
Loading

0 comments on commit b7edfaf

Please sign in to comment.