diff --git a/src/Makefile.am b/src/Makefile.am
index d33e607..dbf79e3 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -26,4 +26,6 @@ rtree.c \
svg.c \
svg_landscape.c \
util.c \
-utree.c
+utree.c \
+hash.c \
+list.c
diff --git a/src/auto.c b/src/auto.c
index d3db87c..9f16976 100644
--- a/src/auto.c
+++ b/src/auto.c
@@ -1,5 +1,5 @@
/*
- Copyright (C) 2015 Tomas Flouri
+ Copyright (C) 2015-2017 Tomas Flouri
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
@@ -159,30 +159,6 @@ static int cb_short_trees(rtree_t * node)
}
-static void hash_tips(rtree_t * root)
-{
- int i;
-
- /* obtain an array of pointers to tip names */
- rtree_t ** tipnodes = (rtree_t **)xmalloc((size_t)(root->leaves) *
- sizeof(rtree_t *));
- rtree_query_tipnodes(root, tipnodes);
-
- /* create a libc hash table of size tip_count */
- hcreate(2*(size_t)(root->leaves));
-
- /* populate a libc hash table with tree tip labels */
- for (i = 0; i < root->leaves; ++i)
- {
- ENTRY entry;
- entry.key = tipnodes[i]->label;
- entry.data = (void *)(tipnodes[i]);
- hsearch(entry, ENTER);
- }
- free(tipnodes);
-}
-
-
static void set_encode_sequence(rtree_t * node,
char * sequence,
long seqlen,
@@ -211,22 +187,47 @@ static void link_sequences(rtree_t * root, char ** headers, char ** sequence, lo
{
int i;
+ /* obtain an array of pointers to tip names */
+ rtree_t ** tipnodes = (rtree_t **)xmalloc((size_t)(root->leaves) *
+ sizeof(rtree_t *));
+ rtree_query_tipnodes(root, tipnodes);
+
+ /* create a libc hash table of size tip_count */
+ hashtable_t * ht = hashtable_create(root->leaves);
+
+ /* populate a libc hash table with tree tip labels */
for (i = 0; i < root->leaves; ++i)
{
- ENTRY query;
-// printf("Linking %s\n", headers[i]);
- query.key = headers[i];
- ENTRY * found = NULL;
+ pair_t * pair = (pair_t *)xmalloc(sizeof(pair_t));
+ pair->label = tipnodes[i]->label;
+ pair->index = i;
- found = hsearch(query,FIND);
+ if (!hashtable_insert(ht,
+ (void *)pair,
+ hash_fnv(tipnodes[i]->label),
+ hashtable_paircmp))
+ fatal("Duplicate taxon (%s)\n", tipnodes[i]->label);
- if (!found)
+ }
+
+ for (i = 0; i < root->leaves; ++i)
+ {
+ pair_t * query = hashtable_find(ht,
+ headers[i],
+ hash_fnv(headers[i]),
+ hashtable_paircmp);
+
+
+ if (!query)
fatal("Sequence with header %s does not appear in the tree", headers[i]);
- set_encode_sequence((rtree_t *)(found->data), sequence[i], seqlen, pll_map_nt);
+ set_encode_sequence(tipnodes[query->index], sequence[i], seqlen, pll_map_nt);
}
-}
+ free(tipnodes);
+
+ hashtable_destroy(ht,free);
+}
static int all_pairwise_dist(rtree_t ** tip_node_list, int tip_list_count, long seqlen)
{
@@ -263,14 +264,9 @@ void detect_min_bl(rtree_t * rtree)
seqlen = load_fasta(rtree->leaves, headers, seqdata);
- hash_tips(rtree);
-
/* find sequences in hash table and link them with the corresponding taxa */
link_sequences(rtree, headers, seqdata, seqlen);
- /* destroy hash table */
- hdestroy();
-
/* get inner nodes that are roots of of the largest short subtrees. Short are
such subtrees where all branch lengths within them are less or equal to
opt_subtree_short. The largest such subtrees are those that are not
diff --git a/src/hash.c b/src/hash.c
new file mode 100644
index 0000000..073b14e
--- /dev/null
+++ b/src/hash.c
@@ -0,0 +1,180 @@
+/*
+ Copyright (C) 2015-2017 Tomas Flouri
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see .
+
+ Contact: Tomas Flouri ,
+ Heidelberg Institute for Theoretical Studies,
+ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
+*/
+
+#include "mptp.h"
+
+
+/* Daniel J. Bernstein 2a hash function */
+unsigned long hash_djb2a(char * s)
+{
+ unsigned long hash = 5381;
+ unsigned long c;
+
+ while ((c = (unsigned long)*s++))
+ hash = ((hash << 5) + hash) ^ c; /* hash*33 ^ c */
+
+ return hash;
+}
+
+/* Fowler–Noll–Vo 1a hash function */
+unsigned long hash_fnv(char * s)
+{
+ unsigned long hash = 14695981039346656037UL;
+ unsigned long c;
+
+ while ((c = (unsigned long)*s++))
+ {
+ hash ^= c;
+ hash *= 1099511628211UL;
+ }
+
+ return hash;
+}
+
+static ht_item_t * hashitem_create(unsigned long key, void * value)
+{
+ ht_item_t * hi = (ht_item_t *)xmalloc(sizeof(ht_item_t));
+
+ hi->key = key;
+ hi->value = value;
+
+ return hi;
+}
+
+int hashtable_strcmp(void * x, void * y)
+{
+ return !strcmp((char *)x, (char *)y);
+}
+
+int hashtable_ptrcmp(void * x, void * y)
+{
+ return (x == y);
+}
+
+int hashtable_paircmp(void * stored, void * query)
+{
+ pair_t * stored_pair = (pair_t *)stored;
+ char * query_label = (char *)query;
+
+ return !strcmp(stored_pair->label, query_label);
+}
+
+void * hashtable_find(hashtable_t * ht,
+ void * x,
+ unsigned long hash,
+ int (*cb_cmp)(void *, void *))
+{
+ unsigned long index = hash & (ht->table_size-1);
+ list_item_t * li = (list_item_t *)(ht->entries[index]->head);
+
+ while (li)
+ {
+ ht_item_t * hi = (ht_item_t *)(li->data);
+
+ if ((hash == hi->key) && cb_cmp(hi->value, x))
+ return hi->value;
+
+ li = li->next;
+ }
+
+ return NULL;
+}
+
+
+hashtable_t * hashtable_create(unsigned long items_count)
+{
+ unsigned long i;
+ unsigned long size = 1;
+
+ if (!items_count) return NULL;
+
+ /* compute a size of at least double the items count that is a
+ multiple of 2 */
+ items_count <<= 1;
+ while (size < items_count)
+ size <<= 1;
+
+ /* allocate and init hash table */
+ hashtable_t * ht = (hashtable_t *)xmalloc(sizeof(hashtable_t));
+ ht->table_size = size;
+ ht->entries_count = 0;
+
+ /* allocate and init entries array */
+ ht->entries = (list_t **)xmalloc(size*sizeof(list_t *));
+ for (i = 0; i < size; ++i)
+ {
+ ht->entries[i] = (list_t *)xmalloc(sizeof(list_t));
+ memset(ht->entries[i], 0, sizeof(list_t));
+ }
+
+ return ht;
+}
+
+int hashtable_insert(hashtable_t * ht,
+ void * x,
+ unsigned long hash,
+ int (*cb_cmp)(void *, void *))
+{
+ /* size is always a multiple of 2 and greater than 2 */
+ unsigned long index = hash & (ht->table_size-1);
+
+ list_t * list = ht->entries[index];
+
+
+ if (hashtable_find(ht, x, hash, cb_cmp))
+ return 0;
+
+ ht_item_t * item = hashitem_create(hash,x);
+ list_append(list, item);
+
+ ht->entries_count++;
+
+ return 1;
+}
+
+void hashtable_destroy(hashtable_t * ht, void (*cb_dealloc)(void *))
+{
+ unsigned long i;
+
+ if (cb_dealloc)
+ {
+ for (i = 0; i < ht->table_size; ++i)
+ {
+ list_t * list = ht->entries[i];
+
+ list_item_t * head = list->head;
+ while (head)
+ {
+ ht_item_t * hi = (ht_item_t *)(head->data);
+ cb_dealloc(hi->value);
+ head = head->next;
+ }
+ }
+ }
+
+ for (i = 0; i < ht->table_size; ++i)
+ {
+ list_clear(ht->entries[i], free);
+ free(ht->entries[i]);
+ }
+ free(ht->entries);
+ free(ht);
+}
diff --git a/src/list.c b/src/list.c
new file mode 100644
index 0000000..85e66f8
--- /dev/null
+++ b/src/list.c
@@ -0,0 +1,98 @@
+/*
+ Copyright (C) 2015 Tomas Flouri
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see .
+
+ Contact: Tomas Flouri ,
+ Heidelberg Institute for Theoretical Studies,
+ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
+*/
+
+#include "mptp.h"
+
+#define DEF_LIST_APPEND 0
+#define DEF_LIST_PREPEND 1
+
+static int list_insert(list_t * list, void * data, int where)
+{
+ if (!list) return 0;
+
+ /* create list item */
+ list_item_t * item = (list_item_t *)xmalloc(sizeof(list_item_t));
+ item->data = data;
+
+ /* if list is empty */
+ if (!(list->count))
+ {
+ list->head = list->tail = item;
+ list->count = 1;
+ item->next = NULL;
+ return 1;
+ }
+
+ /* append */
+ if (where == DEF_LIST_APPEND)
+ {
+ list->tail->next = item;
+ list->tail = item;
+ item->next = NULL;
+ list->count++;
+ return 1;
+ }
+
+ /* prepend */
+ item->next = list->head;
+ list->head = item;
+ list->count++;
+
+ return 1;
+}
+
+void list_append(list_t * list, void * data)
+{
+ list_insert(list, data, DEF_LIST_APPEND);
+}
+
+void list_prepend(list_t * list, void * data)
+{
+ list_insert(list, data, DEF_LIST_PREPEND);
+}
+
+void list_clear(list_t * list, void (*cb_dealloc)(void *))
+{
+ if (!list) return;
+
+ list_item_t * head = list->head;
+
+ while (head)
+ {
+ list_item_t * temp = head;
+ head = head->next;
+ if (cb_dealloc)
+ cb_dealloc(temp->data);
+ free(temp);
+ }
+
+ list->head = list->tail = NULL;
+ list->count = 0;
+}
+
+list_t * list_create(void * data)
+{
+ list_t * list = (list_t *)xmalloc(sizeof(list_t));
+ list->count = 0;
+ list_append(list, data);
+
+ return list;
+}
diff --git a/src/mptp.h b/src/mptp.h
index efd3e3c..c74c903 100644
--- a/src/mptp.h
+++ b/src/mptp.h
@@ -177,6 +177,38 @@ typedef struct pll_fasta
long stripped[256];
} pll_fasta_t;
+typedef struct list_item_s
+{
+ void * data;
+ struct list_item_s * next;
+} list_item_t;
+
+typedef struct list_s
+{
+ list_item_t * head;
+ list_item_t * tail;
+ long count;
+} list_t;
+
+typedef struct ht_item_s
+{
+ unsigned long key;
+ void * value;
+} ht_item_t;
+
+typedef struct hashtable_s
+{
+ unsigned long table_size;
+ unsigned long entries_count;
+ list_t ** entries;
+} hashtable_t;
+
+typedef struct pair_s
+{
+ char * label;
+ int index;
+} pair_t;
+
/* macros */
#define MIN(a,b) ((a) < (b) ? (a) : (b))
@@ -313,9 +345,6 @@ rtree_t * rtree_clone(rtree_t * node, rtree_t * parent);
int rtree_traverse_postorder(rtree_t * root,
int (*cbtrav)(rtree_t *),
rtree_t ** outbuffer);
-rtree_t ** rtree_tipstring_nodes(rtree_t * root,
- char * tipstring,
- unsigned int * tiplist_count);
rtree_t * get_outgroup_lca(rtree_t * root);
rtree_t * rtree_lca(rtree_t * root,
rtree_t ** tip_nodes,
@@ -417,3 +446,40 @@ void aic_mcmc(rtree_t * tree,
long seed,
double * mcmc_min_logl,
double * mcmc_max_logl);
+
+/* functions in hash.c */
+
+unsigned long hash_djb2a(char * s);
+
+unsigned long hash_fnv(char * s);
+
+int hashtable_strcmp(void * x, void * y);
+
+int hashtable_ptrcmp(void * x, void * y);
+
+int hashtable_paircmp(void * stored, void * query);
+
+void * hashtable_find(hashtable_t * ht,
+ void * x,
+ unsigned long hash,
+ int (*cb_cmp)(void *, void *));
+
+hashtable_t * hashtable_create(unsigned long items_count);
+
+int hashtable_insert(hashtable_t * ht,
+ void * x,
+ unsigned long hash,
+ int (*cb_cmp)(void *, void *));
+
+
+/* functions in list.c */
+
+void list_append(list_t * list, void * data);
+
+void list_prepend(list_t * list, void * data);
+
+void list_clear(list_t * list, void (*cb_dealloc)(void *));
+
+list_t * list_create(void * data);
+
+void hashtable_destroy(hashtable_t * ht, void (*cb_dealloc)(void *));
diff --git a/src/rtree.c b/src/rtree.c
index 195a2c9..23bfc7d 100644
--- a/src/rtree.c
+++ b/src/rtree.c
@@ -1,5 +1,5 @@
/*
- Copyright (C) 2015 Tomas Flouri
+ Copyright (C) 2015-2017 Tomas Flouri
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
@@ -427,9 +427,9 @@ rtree_t * rtree_clone(rtree_t * node, rtree_t * parent)
return clone;
}
-rtree_t ** rtree_tipstring_nodes(rtree_t * root,
- char * tipstring,
- unsigned int * tiplist_count)
+static rtree_t ** rtree_tipstring_nodes(rtree_t * root,
+ char * tipstring,
+ unsigned int * tiplist_count)
{
size_t i;
unsigned int k;
@@ -438,8 +438,6 @@ rtree_t ** rtree_tipstring_nodes(rtree_t * root,
char * taxon;
unsigned long taxon_len;
- ENTRY * found = NULL;
-
for (i = 0; i < strlen(tipstring); ++i)
if (tipstring[i] == ',')
commas_count++;
@@ -452,14 +450,19 @@ rtree_t ** rtree_tipstring_nodes(rtree_t * root,
sizeof(rtree_t *));
/* create a hashtable of tip labels */
- hcreate(2 * (size_t)(root->leaves));
+ hashtable_t * ht = hashtable_create(root->leaves);
for (i = 0; i < (unsigned int)(root->leaves); ++i)
{
- ENTRY entry;
- entry.key = node_list[i]->label;
- entry.data = node_list[i];
- hsearch(entry,ENTER);
+ pair_t * pair = (pair_t *)xmalloc(sizeof(pair_t));
+ pair->label = node_list[i]->label;
+ pair->index = i;
+
+ if (!hashtable_insert(ht,
+ (void *)pair,
+ hash_fnv(node_list[i]->label),
+ hashtable_paircmp))
+ fatal("Duplicate taxon (%s)\n", node_list[i]->label);
}
char * s = tipstring;
@@ -475,16 +478,16 @@ rtree_t ** rtree_tipstring_nodes(rtree_t * root,
taxon = xstrndup(s, taxon_len);
/* search tip in hash table */
- ENTRY query;
- query.key = taxon;
- found = NULL;
- found = hsearch(query,FIND);
+ pair_t * query = hashtable_find(ht,
+ taxon,
+ hash_fnv(taxon),
+ hashtable_paircmp);
- if (!found)
+ if (!query)
fatal("Taxon %s in does not appear in the tree", taxon);
/* store pointer in output list */
- out_node_list[k++] = (rtree_t *)(found->data);
+ out_node_list[k++] = node_list[query->index];
/* free tip label, and move to the beginning of next tip if available */
free(taxon);
@@ -494,7 +497,7 @@ rtree_t ** rtree_tipstring_nodes(rtree_t * root,
}
/* kill the hash table */
- hdestroy();
+ hashtable_destroy(ht,free);
free(node_list);
diff --git a/src/utree.c b/src/utree.c
index 6fb6191..ee4ea61 100644
--- a/src/utree.c
+++ b/src/utree.c
@@ -1,5 +1,5 @@
/*
- Copyright (C) 2015 Tomas Flouri
+ Copyright (C) 2015-2017 Tomas Flouri
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
@@ -471,8 +471,6 @@ static utree_t ** utree_tipstring_nodes(utree_t * root,
char * taxon;
size_t taxon_len;
- ENTRY * found = NULL;
-
for (i = 0; i < strlen(tipstring); ++i)
if (tipstring[i] == ',')
commas_count++;
@@ -485,14 +483,19 @@ static utree_t ** utree_tipstring_nodes(utree_t * root,
sizeof(utree_t *));
/* create a hashtable of tip labels */
- hcreate(2 * (size_t)utree_tip_count);
+ hashtable_t * ht = hashtable_create(utree_tip_count);
for (i = 0; i < (unsigned int)utree_tip_count; ++i)
{
- ENTRY entry;
- entry.key = node_list[i]->label;
- entry.data = node_list[i];
- hsearch(entry,ENTER);
+ pair_t * pair = (pair_t *)xmalloc(sizeof(pair_t));
+ pair->label = node_list[i]->label;
+ pair->index = i;
+
+ if (!hashtable_insert(ht,
+ (void *)pair,
+ hash_fnv(node_list[i]->label),
+ hashtable_paircmp))
+ fatal("Duplicate taxon (%s)\n", node_list[i]->label);
}
char * s = tipstring;
@@ -508,16 +511,16 @@ static utree_t ** utree_tipstring_nodes(utree_t * root,
taxon = xstrndup(s, taxon_len);
/* search tip in hash table */
- ENTRY query;
- query.key = taxon;
- found = NULL;
- found = hsearch(query,FIND);
+ pair_t * query = hashtable_find(ht,
+ taxon,
+ hash_fnv(taxon),
+ hashtable_paircmp);
- if (!found)
+ if (!query)
fatal("Taxon %s does not appear in the tree", taxon);
/* store pointer in output list */
- out_node_list[k++] = (utree_t *)(found->data);
+ out_node_list[k++] = node_list[query->index];
/* free tip label, and move to the beginning of next tip if available */
free(taxon);
@@ -527,7 +530,7 @@ static utree_t ** utree_tipstring_nodes(utree_t * root,
}
/* kill the hash table */
- hdestroy();
+ hashtable_destroy(ht,free);
free(node_list);