optimization - Computation time insanely long in python -

i'm running code, consist in find average values. in ~6 million lines csv (ssm_resnik.txt), first row 1 reference, second row , third value 'distance' between 2 references. such distances arbitrarly defined biological criteria not important issue. of the references versus... of references, hence huge csv more 6 millions of lines. in csv (all_spot_uniprot.txt) have ~3600 spot's (first column), each of them 1 or more reference (third column). values same of huge csv. need compare each of 3600 of spot ref of second file other 3600-1 ref's in same file. of possible combinations, if exists, in first, huge csv file (ssm_resnik.txt). all_spot_uniprot.txt, each 2 spot ref's serve iterator correspondent reference (in third column) , iterate on huge csv file that, if exists, show value 2 "vs" reference.

what problem code? well... 10 seconds each iteration, 3600 *3600 *10 = 129.600.000 seconds = 1500 days (almost 5 years). happens in core i3, in mac. below code , portion of each file. please advice me. there code design flaw? there way reduce computation time? in advance...

import csv  spot_to_uniprot=open('all_spot_uniprot.txt', 'rbu') stu=csv.reader(spot_to_uniprot, delimiter='\t')  uniprot_vs_uniprot=open('ssm_resnik.txt', 'rbu') allvsall= csv.reader(uniprot_vs_uniprot, delimiter='\t')  recorder=open('tabela_final.csv', 'wb') fout=csv.writer(recorder, delimiter='\t', quotechar='"')  dict_stu={} #dicionário 'spot uniprot' dict_corresp={} #for each pair of uniprot ref key , value #a list of lists first list 1 spot , second list spot+1 dict_corresp_media={}##average of 1 spot other total_correspondencias_x_y=[] _lista_spot=[] lista_spot=[] lista_temp=[] lista_csv=[]  in stu:     _lista_spot.append(int(a[0]))     if a[0] not in dict_stu.keys():         dict_stu[a[0]]=[]         dict_stu[a[0]].append(a[2])     else:                 dict_stu[a[0]].append(a[2]) n_spot=max(_lista_spot)  spot_to_uniprot.close()  ##for aa in _lista_spot: ##    lista_spot.append(int(aa)) ##lista_spot.sort()  in allvsall:     lista_csv.append(i) tuple_csv=tuple(lista_csv)  uniprot_vs_uniprot.close()  h in range(1, n_spot):     _h in range(h+1, n_spot+1):         #print h, 'h da lista_spot'         del total_correspondencias_x_y[:]         total_correspondencias_x_y.append(dict_stu[str(h)])         #print h, 'h'         #print _h, '_h'         #print __h, '__h'         total_correspondencias_x_y.append(dict_stu[str(_h)])         print total_correspondencias_x_y, 'total_corresp_x_y'          c1 in total_correspondencias_x_y[0]:             if c1=='no data':                 pass             else:                 c2 in total_correspondencias_x_y[1]:                     if c2=='no data':                         pass                     else:                         #print c1, c2, 'c1 e c2'                         c3 in lista_csv:                             if c1 in c3[0]:                                 if c2 in c3[1]:                                     lista_temp.append(c3[2])                                     print lista_temp, 'lista_temp'          elements=len(lista_temp)         if len(lista_temp)==0:             dict_corresp_media[str(h)+'_'+str(_h)]=0         else:             temp_d=0             d in lista_temp:                 temp_d +=float(d)             media_spots=temp_d/elements             dict_corresp_media[str(h)+'_'+str(_h)]=media_spots          print dict_corresp_media[str(h)+'_'+str(_h)]         lista_temp=[]  recorder.close()

this portion of files:

all_spot_uniprot.txt

1   spr0001 q8drq4 1   sp0001  o08397 1   spn01072    b5e568 2   spr0002 p59651 2   sp0002  o06672 2   spn01074    b5e569 3   spr0005 q8drq2 3   sp0005  q97td1 3   spn01078    b5e572 4   spr0006 q8drq1 4   sp0006  q97td0 4   spn01079    b5e573 5   spr0009 q8drq0 5   sp0009  q97tc7 6   spr0010 q8drp9 6   sp0011  q97tc5 6   spn01085    b5e578 7   spr0012 p59652 7   sp0013  o69076 7   spn01087    b5e580 8   spr0017 q8drp6 8   sp0017  no data 8   spn01090    b5e5g4 9   spr0020 q8czd0 9   sp0018  q97tc2 9   spn01093    b5e5g7 10  spr0021 p65888 10  sp0019  p65887 ..  ......  ......  ...... ..  ......  ......  ...... 3617    spr2016 q8dmy7 3617    spr0324 q8dr62 3617    sp2211  no data 3617    sp1311  no data 3617    sp1441  no data 3617    spn11022    no data 3617    spn01038    no data 3617    spn08246    no data 3618    spr2018 q8dmy5 3618    sp0812  no data 3618    sp2213  no data 3618    spn04196    b5e3j0 3618    spn01040    b5e3v9 3619    spr2040 q8dmw6 3619    sp2234  q97n38 3619    spn01065    b5e462 3620    spr2043 p60243

ssm_resnik.txt

q8drq4  o08397  1.0 q8drq4  b5e568  1.0 q8drq4  p59651  0.12077157944440875 q8drq4  o06672  0.12077157944440875 q8drq4  b5e569  0.12077157944440875 q8drq4  q8drq1  0.12077157944440875 q8drq4  q97td0  0.12077157944440875 q8drq4  b5e573  0.12077157944440875 q8drq4  q8drp9  0.07139907404780385 q8drq4  q97tc5  0.07139907404780385 q8drq4  b5e578  0.07139907404780385 q8drq4  p59652  0.04789965413510797 q8drq4  o69076  0.04789965413510797 q8drq4  b5e580  0.04698170092888175 q8drq4  q8drp6  0.12077157944440875 q8drq4  p65888  0.05619465373456477 q8drq4  p65887  0.05619465373456477 q8drq4  b5e5g8  0.05619465373456477 q8drq4  q8drp3  0.0115283466875553 q8drq4  q97tc0  0.0115283466875553 q8drq4  b5e5g9  0.0115283466875553 q8drq4  q8drp2  0.05619465373456477 q8drq4  q97tb9  0.05619465373456477 q8drq4  b5e5h1  0.05619465373456477 q8drq4  q8drp0  0.12077157944440875 q8drq4  b5e5h3  0.12077157944440875 q8drq4  q8dni4  0.12077157944440875 q8drq4  q8cwp0  0.12077157944440875 q8drq4  q97cv3  0.12077157944440875 q8drq4  q97p52  0.12077157944440875 o08397  q97ph8  0.12077157944440875 o08397  p59200  0.10979991157849654 o08397  p59199  0.10979991157849654 o08397  b5e5i1  0.12077157944440875 o08397  q8drn5  0.047725405544042546 o08397  q97ta8  0.047725405544042546 o08397  b5e5i4  0.047725405544042546 o08397  q8drn4  0.1555714706579846 o08397  q97ta7  0.1555714706579846 o08397  b5e5i5  0.1555714706579846 o08397  q97ta6  0.02938784938305615 o08397  q8drn2  0.02938784938305615 o08397  q9f7t4  0.02938784938305615 o08397  p59653  0.04191624792292713 o08397  q03727  0.04191624792292713 o08397  b5e5j1  0.045754049904644475 o08397  p59654  0.01167129073292015 o08397  p36498  0.01167129073292015 o08397  b5e5j2  0.0 o08397  q8drm7  0.05619465373456477 o08397  q07296  0.05619465373456477 o08397  b5e5j3  0.05619465373456477 o08397  q97ta3  0.05619465373456477 o08397  b5e5j5  0.05619465373456477 o08397  q97t99  0.05619465373456477 o08397  q8drl9  0.05619465373456477 o08397  q97t96  0.05619465373456477 o08397  b5e5k1  0.05619465373456477 o08397  q97t95  0.05619465373456477 o08397  q8drl7  0.05619465373456477

6 million rows can either held in memory or in sqlite database. put there , make use of lookup optimizations offers:

with open('ssm_resnik.txt', 'rbu') uniprot_vs_uniprot:     reader = csv.reader(uniprot_vs_uniprot, delimiter='\t')     allvsall = { tuple(r[:2]): float(r[2]) r in reader }

now allvsall mapping offering o(1) lookups; saves having loop on whole 6 million rows each , every combination generate. lot of loops saved.

using collections.defaultdict easier when reading all_spot_uniprot list:

from collections import defaultdict  dict_stu = defaultdict(list) open('all_spot_uniprot.txt', 'rbu') spot_to_uniprot:     reader = csv.reader(spot_to_uniprot, delimiter='\t')     row in reader:         dict_stu[int(row[0])].append(row[2])

there no need find max value here, list keys , pass these itertools.permutations , itertools.product functions generate combinations.

the following code replicates yours in more compact form, fewer lists, , o(1) dictionary lookups fewer loops:

from itertools import permutations, product, ifilter  no_no_data = lambda v: v != 'no data' dict_corresp_media = {}  a, b in permutations(dict_stu.iterkeys(), r=2):     # retrieve , b lists of possible keys, need loop on products     # filter each `no data` keys     aval, bval = ifilter(no_no_data, dict_stu[a]), ifilter(no_no_data, dict_stu[b])      matches = [allvsall[c1, c2] c1, c2 in product(aval, bval) if (c1, c2) in allvsall]     dict_corresp_media['{}_{}'.format(a, b)] = sum(matches) / len(matches) if matches else 0

for input samples, spits out, in fraction of second:

>>> pprint.pprint(dict_corresp_media) {'10_1': 0,  '10_2': 0,  '10_3': 0,  '10_4': 0,  '10_5': 0,  '10_6': 0,  '10_7': 0,  '10_8': 0,  '10_9': 0,  '1_10': 0.05619465373456477,  '1_2': 0.12077157944440875,  '1_3': 0,  '1_4': 0.12077157944440875,  '1_5': 0,  '1_6': 0.07139907404780385,  '1_7': 0.04759366973303256,  '1_8': 0.12077157944440875,  '1_9': 0,  '2_1': 0,  '2_10': 0,  '2_3': 0,  '2_4': 0,  '2_5': 0,  '2_6': 0,  '2_7': 0,  '2_8': 0,  '2_9': 0,  '3_1': 0,  '3_10': 0,  '3_2': 0,  '3_4': 0,  '3_5': 0,  '3_6': 0,  '3_7': 0,  '3_8': 0,  '3_9': 0,  '4_1': 0,  '4_10': 0,  '4_2': 0,  '4_3': 0,  '4_5': 0,  '4_6': 0,  '4_7': 0,  '4_8': 0,  '4_9': 0,  '5_1': 0,  '5_10': 0,  '5_2': 0,  '5_3': 0,  '5_4': 0,  '5_6': 0,  '5_7': 0,  '5_8': 0,  '5_9': 0,  '6_1': 0,  '6_10': 0,  '6_2': 0,  '6_3': 0,  '6_4': 0,  '6_5': 0,  '6_7': 0,  '6_8': 0,  '6_9': 0,  '7_1': 0,  '7_10': 0,  '7_2': 0,  '7_3': 0,  '7_4': 0,  '7_5': 0,  '7_6': 0,  '7_8': 0,  '7_9': 0,  '8_1': 0,  '8_10': 0,  '8_2': 0,  '8_3': 0,  '8_4': 0,  '8_5': 0,  '8_6': 0,  '8_7': 0,  '8_9': 0,  '9_1': 0,  '9_10': 0,  '9_2': 0,  '9_3': 0,  '9_4': 0,  '9_5': 0,  '9_6': 0,  '9_7': 0,  '9_8': 0}

Search This Blog

Detect

optimization - Computation time insanely long in python -

Comments

Post a Comment

Popular posts from this blog

javascript - addthis share facebook and google+ url -

ios - Show keyboard with UITextField in the input accessory view -

c++ - importing crypto++ in QT application and occurring linker errors? -