-
Notifications
You must be signed in to change notification settings - Fork 1
/
transcriptListParse.m
96 lines (80 loc) · 2.92 KB
/
transcriptListParse.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
function [longHeader,longSequence,X]...
=transcriptListParse(transcriptList,cdnaHeader,cdnaSequence,ncrnaHeader,ncrnaSequence,params)
% params = struct('species','Mouse','verbose',1,...
% 'dir1','C:\FISHerMan\Db\Mouse.transcriptList.fas',...
% 'length',40,'number',24);
%% Pick transcript sequences that are longer than a certain threshold
if params(1).verbose
disp('reading the name file of target transcripts');
disp(' thresholding based on sequence length');
end
% use provided transcriptList as targets for designing FISH probes
% targetHeader are one-segment Header
[targetHeader,~]=fastaread(transcriptList);
targetHeader = targetHeader';
totalHeader = vertcat(cdnaHeader,ncrnaHeader);
totalSequence = vertcat(cdnaSequence,ncrnaSequence);
[totalHeader, totalSequence] = pickExpressedSeq(targetHeader, totalHeader, totalSequence);
longHeader = {};
longSequence = {};
for n = 1:length(totalSequence)
if length(totalSequence{n,1}) >= (params(1).length*params(1).number)
% this is to make sure longHeader are two-segment Header
pos=regexp(totalHeader{n,1},':');
if length(pos) >= 2
totalHeader{n,1}=totalHeader{n,1}(1:pos(2)-1);
end
longHeader{end+1,1} = totalHeader{n,1};
longSequence{end+1,1} = totalSequence{n,1};
end
end
if params(1).verbose
disp([' ' num2str(length(longHeader)) ' transcripts have length longer than '...
num2str(params(1).length*params(1).number) ' nts']);
end
%% Check transcript GC contents
GC=[];
for n = 1:length(longHeader)
G=length(strfind(longSequence{n,1},'G'));
C=length(strfind(longSequence{n,1},'C'));
GC=[GC;(G+C)/length(longSequence{n,1})*100];
end
hist(GC,0:5:100);
title('histogram of GC contents');
temp=axis;
temp(1)=0;
temp(2)=100;
axis(temp);
disp(' click to select the boundaries for GC contents');
[X,~]=ginput(2);
X=sort(X,'ascend');
X=floor(X);
if params(1).verbose
disp([' GC content threshold is from ' num2str(X(1)) '% to ' num2str(X(2)) '%']);
end
%% Segment each transcript sequence into 1kb pieces, so later analysis runs more efficiently
if params(1).verbose
disp(' segmenting long sequences into 1kb segments');
end
segHeader = {};
segSequence = {};
for n = 1:length(longHeader)
segN = floor(length(longSequence{n,1})/1000);
probe = longSequence{n,1};
for m = 1:segN
segHeader{end+1,1} = longHeader{n,1};
segSequence{end+1,1} = probe(1000*(m-1)+1:1000*m);
end
if length(probe) >= (1000*segN+30)
segHeader{end+1,1} = longHeader{n,1};
segSequence{end+1,1} = probe(1000*segN+1:end);
end
end
if params(1).verbose
disp('saving the target transcript fasta file');
end
transcriptList = [params(1).species '.target1kb.txt'];
if exist(transcriptList, 'file')
delete(transcriptList);
end
fastawrite(transcriptList, segHeader, segSequence);