diff --git a/src/Makefile.am b/src/Makefile.am index d33e607..dbf79e3 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -26,4 +26,6 @@ rtree.c \ svg.c \ svg_landscape.c \ util.c \ -utree.c +utree.c \ +hash.c \ +list.c diff --git a/src/auto.c b/src/auto.c index d3db87c..9f16976 100644 --- a/src/auto.c +++ b/src/auto.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2015 Tomas Flouri + Copyright (C) 2015-2017 Tomas Flouri This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as @@ -159,30 +159,6 @@ static int cb_short_trees(rtree_t * node) } -static void hash_tips(rtree_t * root) -{ - int i; - - /* obtain an array of pointers to tip names */ - rtree_t ** tipnodes = (rtree_t **)xmalloc((size_t)(root->leaves) * - sizeof(rtree_t *)); - rtree_query_tipnodes(root, tipnodes); - - /* create a libc hash table of size tip_count */ - hcreate(2*(size_t)(root->leaves)); - - /* populate a libc hash table with tree tip labels */ - for (i = 0; i < root->leaves; ++i) - { - ENTRY entry; - entry.key = tipnodes[i]->label; - entry.data = (void *)(tipnodes[i]); - hsearch(entry, ENTER); - } - free(tipnodes); -} - - static void set_encode_sequence(rtree_t * node, char * sequence, long seqlen, @@ -211,22 +187,47 @@ static void link_sequences(rtree_t * root, char ** headers, char ** sequence, lo { int i; + /* obtain an array of pointers to tip names */ + rtree_t ** tipnodes = (rtree_t **)xmalloc((size_t)(root->leaves) * + sizeof(rtree_t *)); + rtree_query_tipnodes(root, tipnodes); + + /* create a libc hash table of size tip_count */ + hashtable_t * ht = hashtable_create(root->leaves); + + /* populate a libc hash table with tree tip labels */ for (i = 0; i < root->leaves; ++i) { - ENTRY query; -// printf("Linking %s\n", headers[i]); - query.key = headers[i]; - ENTRY * found = NULL; + pair_t * pair = (pair_t *)xmalloc(sizeof(pair_t)); + pair->label = tipnodes[i]->label; + pair->index = i; - found = hsearch(query,FIND); + if (!hashtable_insert(ht, + (void *)pair, + hash_fnv(tipnodes[i]->label), + hashtable_paircmp)) + fatal("Duplicate taxon (%s)\n", tipnodes[i]->label); - if (!found) + } + + for (i = 0; i < root->leaves; ++i) + { + pair_t * query = hashtable_find(ht, + headers[i], + hash_fnv(headers[i]), + hashtable_paircmp); + + + if (!query) fatal("Sequence with header %s does not appear in the tree", headers[i]); - set_encode_sequence((rtree_t *)(found->data), sequence[i], seqlen, pll_map_nt); + set_encode_sequence(tipnodes[query->index], sequence[i], seqlen, pll_map_nt); } -} + free(tipnodes); + + hashtable_destroy(ht,free); +} static int all_pairwise_dist(rtree_t ** tip_node_list, int tip_list_count, long seqlen) { @@ -263,14 +264,9 @@ void detect_min_bl(rtree_t * rtree) seqlen = load_fasta(rtree->leaves, headers, seqdata); - hash_tips(rtree); - /* find sequences in hash table and link them with the corresponding taxa */ link_sequences(rtree, headers, seqdata, seqlen); - /* destroy hash table */ - hdestroy(); - /* get inner nodes that are roots of of the largest short subtrees. Short are such subtrees where all branch lengths within them are less or equal to opt_subtree_short. The largest such subtrees are those that are not diff --git a/src/hash.c b/src/hash.c new file mode 100644 index 0000000..073b14e --- /dev/null +++ b/src/hash.c @@ -0,0 +1,180 @@ +/* + Copyright (C) 2015-2017 Tomas Flouri + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as + published by the Free Software Foundation, either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + + Contact: Tomas Flouri , + Heidelberg Institute for Theoretical Studies, + Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany +*/ + +#include "mptp.h" + + +/* Daniel J. Bernstein 2a hash function */ +unsigned long hash_djb2a(char * s) +{ + unsigned long hash = 5381; + unsigned long c; + + while ((c = (unsigned long)*s++)) + hash = ((hash << 5) + hash) ^ c; /* hash*33 ^ c */ + + return hash; +} + +/* Fowler–Noll–Vo 1a hash function */ +unsigned long hash_fnv(char * s) +{ + unsigned long hash = 14695981039346656037UL; + unsigned long c; + + while ((c = (unsigned long)*s++)) + { + hash ^= c; + hash *= 1099511628211UL; + } + + return hash; +} + +static ht_item_t * hashitem_create(unsigned long key, void * value) +{ + ht_item_t * hi = (ht_item_t *)xmalloc(sizeof(ht_item_t)); + + hi->key = key; + hi->value = value; + + return hi; +} + +int hashtable_strcmp(void * x, void * y) +{ + return !strcmp((char *)x, (char *)y); +} + +int hashtable_ptrcmp(void * x, void * y) +{ + return (x == y); +} + +int hashtable_paircmp(void * stored, void * query) +{ + pair_t * stored_pair = (pair_t *)stored; + char * query_label = (char *)query; + + return !strcmp(stored_pair->label, query_label); +} + +void * hashtable_find(hashtable_t * ht, + void * x, + unsigned long hash, + int (*cb_cmp)(void *, void *)) +{ + unsigned long index = hash & (ht->table_size-1); + list_item_t * li = (list_item_t *)(ht->entries[index]->head); + + while (li) + { + ht_item_t * hi = (ht_item_t *)(li->data); + + if ((hash == hi->key) && cb_cmp(hi->value, x)) + return hi->value; + + li = li->next; + } + + return NULL; +} + + +hashtable_t * hashtable_create(unsigned long items_count) +{ + unsigned long i; + unsigned long size = 1; + + if (!items_count) return NULL; + + /* compute a size of at least double the items count that is a + multiple of 2 */ + items_count <<= 1; + while (size < items_count) + size <<= 1; + + /* allocate and init hash table */ + hashtable_t * ht = (hashtable_t *)xmalloc(sizeof(hashtable_t)); + ht->table_size = size; + ht->entries_count = 0; + + /* allocate and init entries array */ + ht->entries = (list_t **)xmalloc(size*sizeof(list_t *)); + for (i = 0; i < size; ++i) + { + ht->entries[i] = (list_t *)xmalloc(sizeof(list_t)); + memset(ht->entries[i], 0, sizeof(list_t)); + } + + return ht; +} + +int hashtable_insert(hashtable_t * ht, + void * x, + unsigned long hash, + int (*cb_cmp)(void *, void *)) +{ + /* size is always a multiple of 2 and greater than 2 */ + unsigned long index = hash & (ht->table_size-1); + + list_t * list = ht->entries[index]; + + + if (hashtable_find(ht, x, hash, cb_cmp)) + return 0; + + ht_item_t * item = hashitem_create(hash,x); + list_append(list, item); + + ht->entries_count++; + + return 1; +} + +void hashtable_destroy(hashtable_t * ht, void (*cb_dealloc)(void *)) +{ + unsigned long i; + + if (cb_dealloc) + { + for (i = 0; i < ht->table_size; ++i) + { + list_t * list = ht->entries[i]; + + list_item_t * head = list->head; + while (head) + { + ht_item_t * hi = (ht_item_t *)(head->data); + cb_dealloc(hi->value); + head = head->next; + } + } + } + + for (i = 0; i < ht->table_size; ++i) + { + list_clear(ht->entries[i], free); + free(ht->entries[i]); + } + free(ht->entries); + free(ht); +} diff --git a/src/list.c b/src/list.c new file mode 100644 index 0000000..85e66f8 --- /dev/null +++ b/src/list.c @@ -0,0 +1,98 @@ +/* + Copyright (C) 2015 Tomas Flouri + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as + published by the Free Software Foundation, either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + + Contact: Tomas Flouri , + Heidelberg Institute for Theoretical Studies, + Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany +*/ + +#include "mptp.h" + +#define DEF_LIST_APPEND 0 +#define DEF_LIST_PREPEND 1 + +static int list_insert(list_t * list, void * data, int where) +{ + if (!list) return 0; + + /* create list item */ + list_item_t * item = (list_item_t *)xmalloc(sizeof(list_item_t)); + item->data = data; + + /* if list is empty */ + if (!(list->count)) + { + list->head = list->tail = item; + list->count = 1; + item->next = NULL; + return 1; + } + + /* append */ + if (where == DEF_LIST_APPEND) + { + list->tail->next = item; + list->tail = item; + item->next = NULL; + list->count++; + return 1; + } + + /* prepend */ + item->next = list->head; + list->head = item; + list->count++; + + return 1; +} + +void list_append(list_t * list, void * data) +{ + list_insert(list, data, DEF_LIST_APPEND); +} + +void list_prepend(list_t * list, void * data) +{ + list_insert(list, data, DEF_LIST_PREPEND); +} + +void list_clear(list_t * list, void (*cb_dealloc)(void *)) +{ + if (!list) return; + + list_item_t * head = list->head; + + while (head) + { + list_item_t * temp = head; + head = head->next; + if (cb_dealloc) + cb_dealloc(temp->data); + free(temp); + } + + list->head = list->tail = NULL; + list->count = 0; +} + +list_t * list_create(void * data) +{ + list_t * list = (list_t *)xmalloc(sizeof(list_t)); + list->count = 0; + list_append(list, data); + + return list; +} diff --git a/src/mptp.h b/src/mptp.h index efd3e3c..c74c903 100644 --- a/src/mptp.h +++ b/src/mptp.h @@ -177,6 +177,38 @@ typedef struct pll_fasta long stripped[256]; } pll_fasta_t; +typedef struct list_item_s +{ + void * data; + struct list_item_s * next; +} list_item_t; + +typedef struct list_s +{ + list_item_t * head; + list_item_t * tail; + long count; +} list_t; + +typedef struct ht_item_s +{ + unsigned long key; + void * value; +} ht_item_t; + +typedef struct hashtable_s +{ + unsigned long table_size; + unsigned long entries_count; + list_t ** entries; +} hashtable_t; + +typedef struct pair_s +{ + char * label; + int index; +} pair_t; + /* macros */ #define MIN(a,b) ((a) < (b) ? (a) : (b)) @@ -313,9 +345,6 @@ rtree_t * rtree_clone(rtree_t * node, rtree_t * parent); int rtree_traverse_postorder(rtree_t * root, int (*cbtrav)(rtree_t *), rtree_t ** outbuffer); -rtree_t ** rtree_tipstring_nodes(rtree_t * root, - char * tipstring, - unsigned int * tiplist_count); rtree_t * get_outgroup_lca(rtree_t * root); rtree_t * rtree_lca(rtree_t * root, rtree_t ** tip_nodes, @@ -417,3 +446,40 @@ void aic_mcmc(rtree_t * tree, long seed, double * mcmc_min_logl, double * mcmc_max_logl); + +/* functions in hash.c */ + +unsigned long hash_djb2a(char * s); + +unsigned long hash_fnv(char * s); + +int hashtable_strcmp(void * x, void * y); + +int hashtable_ptrcmp(void * x, void * y); + +int hashtable_paircmp(void * stored, void * query); + +void * hashtable_find(hashtable_t * ht, + void * x, + unsigned long hash, + int (*cb_cmp)(void *, void *)); + +hashtable_t * hashtable_create(unsigned long items_count); + +int hashtable_insert(hashtable_t * ht, + void * x, + unsigned long hash, + int (*cb_cmp)(void *, void *)); + + +/* functions in list.c */ + +void list_append(list_t * list, void * data); + +void list_prepend(list_t * list, void * data); + +void list_clear(list_t * list, void (*cb_dealloc)(void *)); + +list_t * list_create(void * data); + +void hashtable_destroy(hashtable_t * ht, void (*cb_dealloc)(void *)); diff --git a/src/rtree.c b/src/rtree.c index 195a2c9..23bfc7d 100644 --- a/src/rtree.c +++ b/src/rtree.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2015 Tomas Flouri + Copyright (C) 2015-2017 Tomas Flouri This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as @@ -427,9 +427,9 @@ rtree_t * rtree_clone(rtree_t * node, rtree_t * parent) return clone; } -rtree_t ** rtree_tipstring_nodes(rtree_t * root, - char * tipstring, - unsigned int * tiplist_count) +static rtree_t ** rtree_tipstring_nodes(rtree_t * root, + char * tipstring, + unsigned int * tiplist_count) { size_t i; unsigned int k; @@ -438,8 +438,6 @@ rtree_t ** rtree_tipstring_nodes(rtree_t * root, char * taxon; unsigned long taxon_len; - ENTRY * found = NULL; - for (i = 0; i < strlen(tipstring); ++i) if (tipstring[i] == ',') commas_count++; @@ -452,14 +450,19 @@ rtree_t ** rtree_tipstring_nodes(rtree_t * root, sizeof(rtree_t *)); /* create a hashtable of tip labels */ - hcreate(2 * (size_t)(root->leaves)); + hashtable_t * ht = hashtable_create(root->leaves); for (i = 0; i < (unsigned int)(root->leaves); ++i) { - ENTRY entry; - entry.key = node_list[i]->label; - entry.data = node_list[i]; - hsearch(entry,ENTER); + pair_t * pair = (pair_t *)xmalloc(sizeof(pair_t)); + pair->label = node_list[i]->label; + pair->index = i; + + if (!hashtable_insert(ht, + (void *)pair, + hash_fnv(node_list[i]->label), + hashtable_paircmp)) + fatal("Duplicate taxon (%s)\n", node_list[i]->label); } char * s = tipstring; @@ -475,16 +478,16 @@ rtree_t ** rtree_tipstring_nodes(rtree_t * root, taxon = xstrndup(s, taxon_len); /* search tip in hash table */ - ENTRY query; - query.key = taxon; - found = NULL; - found = hsearch(query,FIND); + pair_t * query = hashtable_find(ht, + taxon, + hash_fnv(taxon), + hashtable_paircmp); - if (!found) + if (!query) fatal("Taxon %s in does not appear in the tree", taxon); /* store pointer in output list */ - out_node_list[k++] = (rtree_t *)(found->data); + out_node_list[k++] = node_list[query->index]; /* free tip label, and move to the beginning of next tip if available */ free(taxon); @@ -494,7 +497,7 @@ rtree_t ** rtree_tipstring_nodes(rtree_t * root, } /* kill the hash table */ - hdestroy(); + hashtable_destroy(ht,free); free(node_list); diff --git a/src/utree.c b/src/utree.c index 6fb6191..ee4ea61 100644 --- a/src/utree.c +++ b/src/utree.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2015 Tomas Flouri + Copyright (C) 2015-2017 Tomas Flouri This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as @@ -471,8 +471,6 @@ static utree_t ** utree_tipstring_nodes(utree_t * root, char * taxon; size_t taxon_len; - ENTRY * found = NULL; - for (i = 0; i < strlen(tipstring); ++i) if (tipstring[i] == ',') commas_count++; @@ -485,14 +483,19 @@ static utree_t ** utree_tipstring_nodes(utree_t * root, sizeof(utree_t *)); /* create a hashtable of tip labels */ - hcreate(2 * (size_t)utree_tip_count); + hashtable_t * ht = hashtable_create(utree_tip_count); for (i = 0; i < (unsigned int)utree_tip_count; ++i) { - ENTRY entry; - entry.key = node_list[i]->label; - entry.data = node_list[i]; - hsearch(entry,ENTER); + pair_t * pair = (pair_t *)xmalloc(sizeof(pair_t)); + pair->label = node_list[i]->label; + pair->index = i; + + if (!hashtable_insert(ht, + (void *)pair, + hash_fnv(node_list[i]->label), + hashtable_paircmp)) + fatal("Duplicate taxon (%s)\n", node_list[i]->label); } char * s = tipstring; @@ -508,16 +511,16 @@ static utree_t ** utree_tipstring_nodes(utree_t * root, taxon = xstrndup(s, taxon_len); /* search tip in hash table */ - ENTRY query; - query.key = taxon; - found = NULL; - found = hsearch(query,FIND); + pair_t * query = hashtable_find(ht, + taxon, + hash_fnv(taxon), + hashtable_paircmp); - if (!found) + if (!query) fatal("Taxon %s does not appear in the tree", taxon); /* store pointer in output list */ - out_node_list[k++] = (utree_t *)(found->data); + out_node_list[k++] = node_list[query->index]; /* free tip label, and move to the beginning of next tip if available */ free(taxon); @@ -527,7 +530,7 @@ static utree_t ** utree_tipstring_nodes(utree_t * root, } /* kill the hash table */ - hdestroy(); + hashtable_destroy(ht,free); free(node_list);