//Ling-Hong Hung Mar 2012
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include "lite.h"
#include "error_handlers.h"
#include <omp.h>
#define MAX_ELEMENTS 8192*8192
#include <iostream>

using namespace std;

//subroutines for all v all TMscore, list versus reference TMscore/rmsd, all v all TMSCORE
//this overload (without single_thread) is the one defined further down in this file
int find_tmscore_matrix(int cpu_flag,int nt, int nwg_per_cu,char *source,int nats,int nstructs, float *coords, float **matrix);

//CPU version of TMScore using hybrid Kabsch/quaternion
//returns the best score found and writes the matching rotation/translation to bR/bt
float tmscore_cpu(int nat,float *coords1,float *coords2,float bR[3][3], float bt[3]);

//version of TMscore scoring/extension routine with aligned coordinate buffering - faster

//routines for OpenCL version of TM-score
//NOTE(review): no definition of the single_thread overload is visible in this file - confirm it lives elsewhere
int find_tmscore_matrix(int single_thread,int cpu_flag,int nt, int nwg_per_cu,char *source,int nats,int nstructs, float *coords, float **matrix);
int convert_coords_to_float4 (int nstructs,int pdb_size, float *coords, float4 *coords4);
int calculate_number_of_frames(int nat);
int define_sizes_string (char **defines_string, int nthreads, int pdb4_size);
int read_source_file(char **array,char *filename,char *defines_string);
int define_decoy_sizes_string (char **defines_string, int nthreads, int pdb4_size);

//input output routines
//NOTE(review): the definition of read_list_of_decoys in this file has no trailing "mode" parameter,
//so this prototype declares a different overload - confirm which one callers are meant to use
int read_list_of_decoys(int *nat, char *filename,float **coords,char **names, int **name_offsets, int mode);
int read_CAs(char *filename, float *coords, int count_only,int center_coords);

//optimised Kabsch routine to be used with eigenvector matrix calculation
double u3b_opt2_d(int nat,double *my_coords,double u[3][3], double t[3]);
//fills u with the rotation matrix derived from correlation matrix r and eigenvalue ev
void rmatrix_d(double ev,double r[3][3],double u[3][3]);
//scores a superposition and rebuilds the aligned subset; returns 1 when the subset changed
int score_fun_dcoords(int nat, float d0, float d, double R[3][3], double t[3],double *coords1, double *coords2,double *acoords,int *ialign,int *nalign,float *tm_score);

int find_tmscore_matrix(int cpu_flag,int nt, int nwg_per_cu,char *source,int nats,int nstructs, float *coords, float **matrix)
{
 int nats4=(nats%4)?nats/4+1:nats/4;
 int pdb_size=3*nats,pdb4_size=3*nats4;
 char *defines_string=0,*kernel_source=0;
 double start_program=0,start_rmsd=0,end=0;
 float4 *coords4;

 //add define string to source to specify maximum size
 define_sizes_string (&defines_string,nt,pdb4_size);
 read_source_file(&kernel_source,source,defines_string);

 //openCL
 cl_int4 sizes,start_points;
 cl_platform_id platform;
 cl_device_id device;
 cl_context context;
 cl_command_queue queue;
 cl_program program;
 cl_kernel tmscore_matrix,tmscore_matrix_rect;
 cl_mem tmscores_buffer,coords41_buffer,coords42_buffer;
 cl_float2 *tmscores;
 cl_int err;
 cl_uint ncu,num_of_devices;
 clGetPlatformIDs( 1, &platform, NULL ); 
 
 // try to get a supported GPU device
 //test with CPU
 if(cpu_flag)
 { 
  if (clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device,&num_of_devices) == CL_SUCCESS)
  {
   clGetDeviceInfo(device,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&ncu,NULL); 
   fprintf(stderr,"using cpu %d cores found\n",ncu);
  }
 }
 else
 {
  if (clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device,&num_of_devices) != CL_SUCCESS)
  {
   fprintf(stderr, "no gpu found - running with cpu");
   if (clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device,&num_of_devices) != CL_SUCCESS) exit(FALSE);
   else fprintf(stderr, "using cpu\n"); 
   clGetDeviceInfo(device,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&ncu,NULL); 
  }
  else 
  {
   clGetDeviceInfo(device,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&ncu,NULL); 
   fprintf(stderr,"%d gpu cards with %d compute units found\n",num_of_devices,ncu);
  }
 }
 context = clCreateContext(NULL,1,&device,NULL,NULL,&err);
 queue = clCreateCommandQueue(context, device, 0, &err); 
 
 //calculate maximum number of workgroups per compute unit
 //for gpus this depends on the memory available
 int lds=1024*32; //local cache size per cu - but at least 2 workgroups/cu are active at any one time so half of this is really available
 if(cpu_flag)lds*=2; 
  //memory used is memory to cache coords plus memory for the alignment and reduction - this is different from the simple tmscore where the coords memory is used once
 int mem_per_wg=(6*nats4*sizeof(float4)>nt*sizeof(float4)) ?6*nats4*sizeof(float4) : nt*sizeof(float4);
 int max_wg_per_cu=lds/mem_per_wg/2;

 if( max_wg_per_cu <1) max_wg_per_cu =1;
 if(nwg_per_cu)max_wg_per_cu=nwg_per_cu;
 unsigned int max_nwg=(max_wg_per_cu)*ncu;
 start_rmsd = omp_get_wtime();  
 program = clCreateProgramWithSource(context,1,(const char**)&kernel_source, NULL,&err);
 if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS)
 {
  printf("Error building program\n");
  char buf[0x10000];
  clGetProgramBuildInfo( program,device,CL_PROGRAM_BUILD_LOG,0x10000,buf,NULL);
  fprintf(stderr,"\n%s\n", buf);
  return 1;
 }
 end = omp_get_wtime(); 
 fprintf(stderr, "%8.3f seconds elapsed for program generation\n",end-start_rmsd);
 start_rmsd = omp_get_wtime();  

 //start_rmsd = omp_get_wtime();  
 int max_structs_for_coords=(int)(MAX_ELEMENTS/pdb4_size);
 int max_structs_for_matrix=(int)sqrt((float) MAX_ELEMENTS/sizeof(float2));
 int max_structs=(max_structs_for_coords < max_structs_for_matrix)? max_structs_for_coords : max_structs_for_matrix;

 if(!max_structs){fprintf(stderr,"insufficient memory to load structure\n");exit(FALSE);} 
 if(nstructs<max_structs)max_structs=nstructs;
 
 int ngrid=(nstructs%max_structs)?nstructs/max_structs+1 : nstructs/max_structs; //size of the grid of tiles - calculation is split into ngrid*(ngrid-1)/2 submatrices for large number of structures
 
 int block_matrix_size_tr=max_structs*(max_structs-1)/2;
 int block_matrix_size_sq=max_structs*max_structs;


 //create hosts arrays and buffers
 if (!(coords4 = (float4*)  malloc(pdb4_size * nstructs *sizeof(float4)))) exit(FALSE);
 convert_coords_to_float4 (nstructs,pdb_size,coords,coords4);

 tmscore_matrix = clCreateKernel(program, "tmscore_matrix", &err);
 coords41_buffer = clCreateBuffer(context,CL_MEM_READ_ONLY, max_structs*pdb4_size * sizeof(float4),NULL, NULL );

 if(ngrid >1)
 {
  tmscore_matrix_rect = clCreateKernel(program, "tmscore_matrix_rect", &err);
  coords42_buffer = clCreateBuffer(context,CL_MEM_READ_ONLY, max_structs*pdb4_size * sizeof(float4),NULL, NULL );
  tmscores_buffer = clCreateBuffer(context,CL_MEM_READ_WRITE, block_matrix_size_sq * sizeof(cl_float2),NULL, NULL);
  if((!(tmscores=(cl_float2*)malloc(sizeof(cl_float2)*block_matrix_size_sq))))exit(FALSE);  

 }
 else
 {
  tmscores_buffer = clCreateBuffer(context,CL_MEM_READ_WRITE, block_matrix_size_tr * sizeof(cl_float2),NULL, NULL);
  if((!(tmscores=(cl_float2*)malloc(sizeof(cl_float2)*block_matrix_size_tr))))exit(FALSE);  
 }
 int nseeds=calculate_number_of_frames(nats);

 sizes.x=nats;sizes.y=nats4;
 //indices need to be worked out
 for (int ni=0;ni<ngrid;ni++)
 {
  //triangular tiles first - calculate the on diagonal submatrices
  //calculate block_size
  int block_structs=(max_structs<nstructs-ni*max_structs) ? max_structs : nstructs-ni*max_structs;
  int nwu= block_structs*(block_structs-1)/2;
  int offset=ni*max_structs;
  sizes.z=block_structs;
  sizes.w=nwu;
  start_points.x=0;start_points.y=0;start_points.z=block_structs;start_points.w=block_structs;
  clSetKernelArg(tmscore_matrix, 0,sizeof(cl_int4),&sizes); 
  clSetKernelArg(tmscore_matrix, 1,sizeof(int),&nseeds);
  clSetKernelArg(tmscore_matrix, 2,sizeof(cl_int4),&start_points);
  clSetKernelArg(tmscore_matrix, 3,sizeof(coords41_buffer),&coords41_buffer);
  clSetKernelArg(tmscore_matrix, 4,sizeof(tmscores_buffer),&tmscores_buffer);
  clEnqueueWriteBuffer(queue, coords41_buffer , CL_TRUE, 0,block_structs*pdb4_size * sizeof(float4),&(coords4[ni*max_structs*pdb4_size]), 0, NULL, NULL); 
  clFinish( queue);
  size_t global,local=nt;
  global=max_nwg*local;
  clEnqueueNDRangeKernel(queue, tmscore_matrix, 1, NULL, &global, &local, 0, NULL, NULL);
  clFinish(queue);
  clEnqueueReadBuffer(queue, tmscores_buffer, CL_TRUE, 0,nwu*sizeof(cl_float2),tmscores,0,NULL,NULL);
  clFinish( queue);

  //output to matrix
  {
   int m=0;
   for(int i=0;i<block_structs-1;i++)
    for(int j=i+1;j<block_structs;j++)
    {
     matrix[i+offset][j+offset]=tmscores[m].x;
     m++;
    }
  }
 }
 for (int ni=0;ni<ngrid-1;ni++)
  for (int nj=ni+1;nj<ngrid;nj++)
   if(ni!=nj)
   {
    //rectangular tile
    int block_structs1=(max_structs<nstructs-ni*max_structs) ? max_structs : nstructs-ni*max_structs;
    int block_structs2=(max_structs<nstructs-nj*max_structs) ? max_structs : nstructs-nj*max_structs;
    int nwu= block_structs1*block_structs2;
    int offset1=ni*max_structs;
    int offset2=nj*max_structs;
    sizes.z=block_structs1;
    sizes.w=nwu;
    fprintf(stderr," ni %d nj %d nwu %d block_sizes %d %d grid_size %d\n",ni,nj,nwu,block_structs1,block_structs2,ngrid);

    start_points.x=0;start_points.y=0;start_points.z=block_structs1;start_points.w=block_structs2;
    clSetKernelArg(tmscore_matrix_rect, 0,sizeof(cl_int4),&sizes); 
    clSetKernelArg(tmscore_matrix_rect, 1,sizeof(int),&nseeds);
    clSetKernelArg(tmscore_matrix_rect, 2,sizeof(cl_int4),&start_points);
    clSetKernelArg(tmscore_matrix_rect, 3,sizeof(coords41_buffer),&coords41_buffer);
    clSetKernelArg(tmscore_matrix_rect, 4,sizeof(coords42_buffer),&coords42_buffer);
    clSetKernelArg(tmscore_matrix_rect, 5,sizeof(tmscores_buffer),&tmscores_buffer);
    clEnqueueWriteBuffer(queue, coords41_buffer , CL_TRUE, 0,block_structs1*pdb4_size * sizeof(float4),&(coords4[ni*max_structs*pdb4_size]), 0, NULL, NULL); 
    clEnqueueWriteBuffer(queue, coords42_buffer , CL_TRUE, 0,block_structs2*pdb4_size * sizeof(float4),&(coords4[nj*max_structs*pdb4_size]), 0, NULL, NULL); 
    clFinish( queue );
    size_t global,local=nt;
    global=max_nwg*local;
    clEnqueueNDRangeKernel(queue, tmscore_matrix_rect, 1, NULL, &global, &local, 0, NULL, NULL);
    clFinish( queue );
    clEnqueueReadBuffer(queue, tmscores_buffer, CL_TRUE, 0,nwu*sizeof(cl_float2),tmscores,0,NULL,NULL);
    clFinish( queue);
    {
     int m=0;
     for(int i=0;i<block_structs1;i++)
      for(int j=0;j<block_structs2;j++)
      {
       matrix[i+offset1][j+offset2]=tmscores[m].x;
       m++;
      }
    }
   }
 fprintf(stderr,"finished\n");  
 clReleaseMemObject(coords41_buffer);
 clReleaseMemObject(tmscores_buffer);
 clReleaseProgram(program);
 clReleaseKernel(tmscore_matrix);
 end = omp_get_wtime();  
 fprintf(stderr, "%8.3f seconds elapsed for %d TM-scores at %8.3f ms per TM-score\n",end-start_rmsd,nstructs*(nstructs-1)/2,(float)((end-start_rmsd)*1000)/(float)(nstructs*(nstructs-1)/2));
  
 if(ngrid >1)
 {
 clReleaseMemObject(coords42_buffer);
 clReleaseKernel(tmscore_matrix_rect);
 }
 clReleaseCommandQueue(queue);
 clReleaseContext(context);
 if(coords4)free(coords4);
 if(tmscores)free(tmscores);
 if(defines_string)free(defines_string);
 if(kernel_source)free(kernel_source);
 //symmetrize matrix
 for (int i=0; i<nstructs;i++)
  for (int j=i; j<nstructs;j++)
  {
   if(i==j)matrix[i][j]=0;
   else matrix[j][i]=matrix[i][j];
  }
}

//Builds the "#define NTHREADS / #define PDB4SIZE" preamble that is prepended to the kernel source.
// defines_string  in/out: any string it already points to is freed and replaced by a fresh heap copy
// nthreads        value written after NTHREADS
// pdb4_size       value written after PDB4SIZE
//Returns the length of the new string (terminator not counted); exits if the allocation fails.
int define_sizes_string (char **defines_string, int nthreads, int pdb4_size)
{
 char text[1000];
 const int length=sprintf(text,"#define NTHREADS %d\n#define PDB4SIZE %d\n",nthreads,pdb4_size);

 //drop whatever an earlier call left behind
 if(*defines_string)
  free(*defines_string);

 *defines_string=(char*)malloc(sizeof(char)*(length+1));
 if(!*defines_string)
  exit(FALSE);
 strcpy(*defines_string,text);
 return length;
}
//Reads a list file naming one structure file per line and loads the CA coordinates of each.
// nat           out: CA count reported by read_CAs for the first file that yields a non-zero count
// filename      list file; lines that start with a newline are skipped
// coords        out: newly allocated array of nstructs*3*nat floats (centred by read_CAs)
// names         out: newly allocated buffer holding all file names back to back, each NUL terminated
// name_offsets  out: newly allocated array of nstructs+1 offsets into names (last entry = total length)
//Returns the number of structures listed; 0 (with the outputs left untouched) for an empty list.
//structures are assumed to be identical - only CA read in
int read_list_of_decoys(int *nat, char *filename,float **coords,char **names, int **name_offsets)
{
 FILE *list_fp;
 char *my_names=0;
 char line[LINE_LENGTH],pdb_filename[FILENAME_LENGTH];
 int m=0,pdb_size=0,nstructs=0,my_nat=0,name_length=0,current_offset=0,*my_offsets=0;
 open_file(&list_fp, filename, "r", "read_list_of_structures");

 //first pass: count the entries, the space needed for their names and the atoms per structure
 while (fgets(line, LINE_LENGTH, list_fp))
 {
  if(line[0] == '\n')continue;
  check_eof(sscanf (line, "%s", pdb_filename), "read_conformation_file");
  name_length+=strlen(pdb_filename)+1;
  nstructs++;
  if(!my_nat)
  {
   my_nat=read_CAs(pdb_filename,*coords, 1,0);
   *nat=my_nat;
  }
 }
 fprintf(stderr,"%d structs in list and %d chars in names %d CA atoms in decoys\n",nstructs,name_length,my_nat);
 if(!nstructs)
 {
  //nothing to load - close the list before leaving so the handle is not leaked
  close_file(&list_fp,filename,"read_list_of_structures");
  return(0);
 }

 //allocate memory
 if(!(my_names=(char*)malloc(name_length*sizeof(char))))exit(FALSE);
 if(!(my_offsets=(int*)malloc((nstructs+1)*sizeof(int))))exit(FALSE);
 if(!(*coords=(float*)malloc((size_t)nstructs*3*my_nat*sizeof(float))))exit(FALSE);
 pdb_size=3*my_nat;

 //second pass: must accept exactly the lines the first pass counted, otherwise the
 //buffers sized above would be overrun (blank lines are skipped here as well)
 rewind(list_fp);
 while (m<nstructs && fgets(line, LINE_LENGTH, list_fp))
 {
  if(line[0] == '\n')continue;
  check_eof(sscanf (line, "%s", pdb_filename), "read_conformation_file");
  name_length=strlen(pdb_filename)+1; //includes the terminating NUL
  my_offsets[m]=current_offset;
  strncpy(&(my_names[current_offset]),pdb_filename,name_length);
  read_CAs(pdb_filename,&((*coords)[m*pdb_size]),0,1);
  m++;
  current_offset+=name_length;
  if(m%100==0)fprintf(stderr,"%d files read in\n",m);
 }
 my_offsets[m]=current_offset;
 *name_offsets=my_offsets;
 *names=my_names;
 close_file(&list_fp,filename,"read_list_of_structures");
 return(nstructs);
}
int read_CAs(char *filename, float *coords, int count_only,int center_coords)
{
 FILE *conformation_fp;
 char line[LINE_LENGTH],r_name[4],a_name[5];
 float a_x, a_y, a_z;
 int atom_id,res_id,m=0;
 float sum_x=0,sum_y=0,sum_z=0;
 open_file(&conformation_fp, filename, "r", 0);
 while (fgets(line, LINE_LENGTH, conformation_fp))
  if (strncmp(line, "ATOM", 4) == 0)
  {
   int i,j;
   char temp_str[LINE_LENGTH];

   j=0;
   for (i=12;i<=15;i++)
   temp_str[j++]=line[i];
   temp_str[j]='\0';
   check_eof(sscanf (temp_str, "%s", a_name), "read_conformation_file");
   if (strcmp(a_name,"CA")==0)
   {
    if(!count_only)
    {
     j=0;
     for (i=30;i<=37;i++)
      temp_str[j++]=line[i];
     temp_str[j]='\0';
     check_eof(sscanf (temp_str, "%f", &a_x), "read_conformation_file");

     j=0;
     for (i=38;i<=45;i++)
      temp_str[j++]=line[i];
     temp_str[j]='\0';
     check_eof(sscanf (temp_str, "%f", &a_y), "read_conformation_file");
     j=0;
     for (i=46;i<=53;i++)
     temp_str[j++]=line[i];
     temp_str[j]='\0';
     check_eof(sscanf (temp_str, "%f", &a_z), "read_conformation_file");
     coords[m++]=a_x;
     coords[m++]=a_y;
     coords[m++]=a_z;
     if(center_coords)
     {
      sum_x+=a_x;
      sum_y+=a_y;
      sum_z+=a_z;
     }
    }
    else m+=3; 
   }
  }
 if(center_coords)
 {
  float nat=(float)(m/3);
  int n=0;
  sum_x/=nat;
  sum_y/=nat;
  sum_z/=nat;
  for(int i=0;i<nat;i++)
  {
   coords[n++]-=sum_x;
   coords[n++]-=sum_y;
   coords[n++]-=sum_z;
  }
 }
 close_file(&conformation_fp, filename, 0);
 return(m/3);
}
//Loads a kernel source file into a newly allocated string, prefixed by defines_string.
// array           out: receives the heap string (defines followed by file contents, NUL terminated);
//                 the caller owns it and releases it with free()
// filename        file to read
// defines_string  text placed in front of the file contents; may be NULL for no prefix
//Returns the number of bytes read from the file (0 for an empty file, in which case *array
//holds just the prefix). Exits if the allocation fails.
int read_source_file(char **array,char *filename,char *defines_string)
{
 FILE *fp;
 long size;
 size_t prefix_len=0,nread=0;
 char *my_array=0;
 if(defines_string)prefix_len=strlen(defines_string);
 open_file(&fp, filename, "r", "read_source_file");
 fseek (fp , 0 , SEEK_END);
 size = ftell (fp);
 rewind (fp);
 if(size<0)size=0; //ftell reports failure as -1
 if(!(my_array=(char*)malloc(sizeof(char)*(prefix_len+size+1))))exit(FALSE);
 if(prefix_len)memcpy(my_array,defines_string,prefix_len);
 if(size)nread=fread(my_array+prefix_len,sizeof(char),size,fp);
 close_file(&fp, filename, "read_source_file");
 //terminate after what was actually read - fread may deliver fewer bytes than ftell reported
 my_array[prefix_len+nread]='\0';
 *array=my_array;
 return((int)nread);
}

//Counts the seed frames used for a structure of nat atoms.
//The seed length starts at nat and is replaced by nat/2, nat/4, ... for as long as it stays
//above 4 and the divisor is at most 16; every length contributes nat-length+1 start
//positions. A final pass with length 4 is always added.
//Prints the result to stderr and returns it; exits when nat is below 3.
int calculate_number_of_frames(int nat)
{
 const int min_seed_len=4;
 if(nat <3)
 {
  fprintf(stderr,"need at least 3 atoms for alignment\n");
  exit(FALSE);
 }

 int total=0;
 int seed_len=nat;
 for(int divisor=1; seed_len > min_seed_len && divisor <=16; seed_len=nat/divisor)
 {
  total+=nat-seed_len+1;
  divisor*=2;
 }
 total+=nat-min_seed_len+1;

 fprintf(stderr,"nat %d seeds %d\n",nat,total);
 return(total);
}

int convert_coords_to_float4 (int nstructs,int pdb_size, float *coords, float4 *coords4)
{
 //just rearrange and pad with zeros to 4;
 float my_coords[12];
 int p,j,k=0,natoms,m=0,mod4=0;
 natoms=pdb_size/3;
 for (p=0;p<nstructs;p++)
 {
  for(j=0;j<natoms/4;j++)
  {
   coords4[m].x=coords[k++];
   coords4[m+1].x=coords[k++];
   coords4[m+2].x=coords[k++];
   coords4[m].y=coords[k++];
   coords4[m+1].y=coords[k++];
   coords4[m+2].y=coords[k++];
   coords4[m].z=coords[k++];
   coords4[m+1].z=coords[k++];
   coords4[m+2].z=coords[k++];
   coords4[m].w=coords[k++];
   coords4[m+1].w=coords[k++];
   coords4[m+2].w=coords[k++];
   m+=3;
  }
  if((mod4=(natoms%4))) //now pad with zeros
  {
   int n,q=0;
   for (n=0;n<12;n++)
    my_coords[n]=0;
   for (n=0;n<mod4*3;n++)
    my_coords[n]=coords[k++]; 
   q=0;
   coords4[m].x=my_coords[q++];
   coords4[m+1].x=my_coords[q++];
   coords4[m+2].x=my_coords[q++];
   coords4[m].y=my_coords[q++];
   coords4[m+1].y=my_coords[q++];
   coords4[m+2].y=my_coords[q++];
   coords4[m].z=my_coords[q++];
   coords4[m+1].z=my_coords[q++];
   coords4[m+2].z=my_coords[q++];
   coords4[m].w=my_coords[q++];
   coords4[m+1].w=my_coords[q++];
   coords4[m+2].w=my_coords[q++];
   m+=3;
  }
 }
 return(m);
}

//Produces the NTHREADS/PDB4SIZE "#define" preamble for the kernel source.
//Any string *defines_string already points to is freed first; the new text is stored in a
//fresh heap buffer that the caller releases with free().
//Returns the number of characters in the preamble; exits if the allocation fails.
int define_decoy_sizes_string (char **defines_string, int nthreads, int pdb4_size)
{
 char scratch[1000];
 char *result=0;
 int nchars=sprintf(scratch,"#define NTHREADS %d\n#define PDB4SIZE %d\n",nthreads,pdb4_size);

 if(*defines_string)free(*defines_string);
 result=(char*)malloc(sizeof(char)*(nchars+1));
 *defines_string=result;
 if(!result)exit(FALSE);
 strcpy(result,scratch);
 return(nchars);
}

//copies the double precision superposition (R,t) into the caller's single precision outputs
static void store_best_transform(double R[3][3],double t[3],float bR[3][3],float bt[3])
{
 for(int j=0;j<3;j++)
 {
  bt[j]=t[j];
  for(int k=0;k<3;k++)
   bR[j][k]=R[j][k];
 }
}

//CPU TM-score of two structures with the same number of atoms.
// nat      number of atoms; at least 3 are required
// coords1  3*nat floats, x,y,z per atom
// coords2  3*nat floats, x,y,z per atom
// bR,bt    out: rotation and translation of the best scoring superposition found
//Seeds are contiguous windows of length nat, nat/2, nat/4, ... (at most five of them, each
//longer than the minimum seed length) followed by the minimum length itself. Every seed is
//superposed with u3b_opt2_d, scored with score_fun_dcoords and then refined for up to 20
//iterations or until the aligned subset stops changing.
//Returns the highest score seen. Exits on allocation failure or when nat is below 3.
float tmscore_cpu(int nat,float *coords1,float *coords2, float bR[3][3], float bt[3])
{
 int *ialign=0;
 float max_score=-1;
 double R[3][3],t[3];
 double *acoords=0,*dcoords1=0,*dcoords2=0;

 //with fewer than three atoms the distance cut-off in score_fun_dcoords can never
 //collect three atoms and would grow forever
 if(nat<3)
 {
  fprintf(stderr,"need at least 3 atoms for alignment\n");
  exit(FALSE);
 }
 if(!(ialign=(int*)malloc(sizeof(int)*nat)))exit(FALSE);
 if(!(acoords=(double*)malloc(sizeof(double)*6*nat)))exit(FALSE);
 if(!(dcoords1=(double*)malloc(sizeof(double)*3*nat)))exit(FALSE);
 if(!(dcoords2=(double*)malloc(sizeof(double)*3*nat)))exit(FALSE);
 for(int i=0;i<nat*3;i++)
 {
  dcoords1[i]=coords1[i];
  dcoords2[i]=coords2[i];
 }

 //d0 - the cube root of a negative number via pow() is NaN, which slips past the
 //lower bound below, so short chains get the floor value directly
 float d0=(nat>15) ? 1.24*pow((nat-15),(1.0/3.0))-1.8 : 0.5;
 if(d0< 0.5)d0=0.5;
 //d0_search ----->
 float d,d0_search=d0;
 if(d0_search > 8)d0_search=8;
 if(d0_search <4.5)d0_search=4.5;
 //iterative parameters ----->

 int n_it=20;      //maximum number of iterations
 int n_init=0;
 int L_ini_min=4;
 int L_ini[6];     //at most five halving steps plus the minimum length

 if(nat < 4) L_ini_min=nat;
 int len=nat;
 int divisor=1;
 while(len > L_ini_min && n_init <5)
 {
  L_ini[n_init++]=len;
  divisor*=2;
  len=nat/divisor;
 }
 L_ini[n_init++]=4;
 if (L_ini[n_init-1] > L_ini_min)L_ini[n_init++]=L_ini_min;

 // find the maximum score starting from local structures superposition
 float score; //TM-score
 for (int seed=0;seed<n_init;seed++)
 {
  //find the initial rotation matrix using the initial seed residues
  int L_init=L_ini[seed];
  for(int istart=0;istart<=nat-L_init;istart++)
  {
   int nchanges=1;
   int nalign=L_init;
   {
    //the seed is the window [istart,istart+L_init); acoords holds the paired coordinates
    int m=0;
    int n=0;
    for(int i=istart;i<istart+L_init;i++)
    {
     ialign[n++]=i;
     acoords[m++]=dcoords1[3*i];
     acoords[m++]=dcoords1[3*i+1];
     acoords[m++]=dcoords1[3*i+2];
     acoords[m++]=dcoords2[3*i];
     acoords[m++]=dcoords2[3*i+1];
     acoords[m++]=dcoords2[3*i+2];
    }
   }
   u3b_opt2_d(nalign,acoords,R, t);
   score_fun_dcoords(nat, d0, d0_search-1,R,t,dcoords1,dcoords2,acoords,ialign,&nalign,&score);

   d=d0_search+1;
   if(score > max_score)
   {
    max_score=score;
    store_best_transform(R,t,bR,bt);
   }
   //extend search from seed
   for (int iter=0;iter<n_it && nchanges;iter++)
   {
    u3b_opt2_d(nalign,acoords,R, t);
    nchanges=score_fun_dcoords(nat, d0,d,R,t,dcoords1,dcoords2,acoords,ialign,&nalign,&score);

    if(score > max_score)
    {
     max_score=score;
     store_best_transform(R,t,bR,bt);
    }
   }
  }
 }
 if(ialign)free(ialign);
 if(acoords)free(acoords);
 if(dcoords1)free(dcoords1);
 if(dcoords2)free(dcoords2);
 return(max_score);
}
//Superposition of nat paired points.
// nat        number of point pairs
// my_coords  6*nat doubles: x1,y1,z1,x2,y2,z2 for every pair
// u          out: rotation matrix, filled in by rmatrix_d
// t          out: translation such that the rotated centroid of set 1 lands on the centroid of set 2
//The routine forms the centred correlation matrix r of the two sets, obtains the
//eigenvalues of r*r^T in closed form, sums their square roots (the smallest one negated
//when det(r) is negative) and hands that value together with r to rmatrix_d.
//The return value is always 0 - no rms is computed here.
double u3b_opt2_d(int nat,double *my_coords,double u[3][3], double t[3])
{
 const double sqrt3=1.73205080756888;
 const double rms=0;
 double sum1[3]={0,0,0},sum2[3]={0,0,0};
 double cross[3][3]={{0,0,0},{0,0,0},{0,0,0}};
 double r[3][3],rr[6],e[3];

 //accumulate coordinate sums and the products of every set 1 component with every set 2 component
 const double *pair=my_coords;
 for(int atom=0;atom<nat;atom++,pair+=6)
 {
  for(int i=0;i<3;i++)
  {
   sum1[i]+=pair[i];
   sum2[i]+=pair[3+i];
  }
  for(int i=0;i<3;i++)
   for(int j=0;j<3;j++)
    cross[i][j]+=pair[i]*pair[3+j];
 }
 const double fnat=(double) nat;

 //remove the centroid contribution
 for(int i=0;i<3;i++)
  for(int j=0;j<3;j++)
   r[i][j]=cross[i][j]-sum1[i]*sum2[j]/fnat;

 const double det= r[0][0] * ( (r[1][1]*r[2][2]) - (r[1][2]*r[2][1]) )- r[0][1] * ( (r[1][0]*r[2][2]) - (r[1][2]*r[2][0]) ) + r[0][2] * ( (r[1][0]*r[2][1]) - (r[1][1]*r[2][0]) );

 //lower triangle of r*r^T, stored row by row
 {
  int m=0;
  for(int i=0;i<3;i++)
   for(int j=0;j<=i;j++)
    rr[m++]= r[i][0]*r[j][0]+ r[i][1]*r[j][1]+ r[i][2]*r[j][2];
 }
 const double spur=(rr[0]+rr[2]+rr[5]) / 3.0; //average of diagonal sum
 const double cof=(((((rr[2]*rr[5] - rr[4]*rr[4]) + rr[0]*rr[5])- rr[3]*rr[3]) + rr[0]*rr[2]) - rr[1]*rr[1]) / 3.0;

 //all three eigenvalues default to spur and are refined only when h is positive
 e[0]=e[1]=e[2]=spur;
 const double h=( spur > 0 )? spur*spur-cof : -1;
 if(h>0)
 {
  const double det2=det*det;
  const double g = (spur*cof - det2)/2.0 - spur*h;
  const double sqrth = sqrt(h);
  const double disc = h*h*h - g*g;
  const double angle= ( disc<0 ) ? atan2(0.0,-g) / 3.0 : atan2(sqrt(disc),-g)/3.0;
  const double cth = sqrth * cos(angle);
  const double sth = sqrth*sqrt3*sin(angle);
  e[0] = (spur + cth) + cth;
  e[1] = (spur - cth) + sth;
  e[2] = (spur - cth) - sth;
 }

 for(int i=0;i<3;i++)
  e[i]=(e[i] < 0) ? 0 : sqrt(e[i]);

 //a negative determinant flips the sign of the smallest term
 double ev=e[2];
 if(det < 0) ev=-ev;
 ev+=e[1] + e[0];

 //calculate R vectors - ev is the eigenvalue
 rmatrix_d(ev,r,u);

 //translation for 1 to 2
 for(int j=0;j<3;j++)
  t[j] =  sum2[j]/fnat - (u[0][j]*sum1[0]/fnat + u[1][j]*sum1[1]/fnat + u[2][j]*sum1[2]/fnat);
 return(rms);
}

//Builds the rotation matrix u from the correlation matrix r and the eigenvalue ev.
//A symmetric 4x4 matrix is assembled from r, ev is subtracted from its diagonal, and a
//quaternion is taken from cofactors of the result (the approach is attributed to Theobald
//in the original comment). The quaternion is normalised by its squared length, which is
//replaced by DBL_MIN when it is exactly zero, and then expanded into the 3x3 matrix u.
void rmatrix_d(double ev,double r[3][3],double u[3][3])
{
 //upper triangle of the symmetric 4x4 matrix with ev already removed from the diagonal
 const double k00=(r[0][0]+r[1][1]+r[2][2])-ev;
 const double k01=(r[1][2]-r[2][1]);
 const double k02=(r[2][0]-r[0][2]);
 const double k03=(r[0][1]-r[1][0]);
 const double k11=(r[0][0]-r[1][1]-r[2][2])-ev;
 const double k12=(r[0][1]+r[1][0]);
 const double k13=(r[2][0]+r[0][2]);
 const double k22=(-r[0][0]+r[1][1]-r[2][2])-ev;
 const double k23=(r[1][2]+r[2][1]);
 const double k33=(-r[0][0]-r[1][1]+r[2][2])-ev;
 (void)k00; //computed as in the derivation; the cofactors below do not need it

 //from Theobald - 2x2 minors shared by the cofactors
 const double m2233 = k22 * k33 - k23 * k23;
 const double m1233 = k12 * k33 - k13 * k23;
 const double m1223 = k12 * k23 - k13 * k22;
 const double m0232 = k02 * k23 - k03 * k22;
 const double m0233 = k02 * k33 - k03 * k23;
 const double m0231 = k02 * k13 - k03 * k12;

 double q[4];
 q[0]= k11*m2233-k12*m1233+k13*m1223;
 q[1]=-k01*m2233+k12*m0233-k13*m0232;
 q[2]= k01*m1233-k11*m0233+k13*m0231;
 q[3]=-k01*m1223+k11*m0232-k12*m0231;

 double norm2=q[0]*q[0]+q[1]*q[1]+q[2]*q[2]+q[3]*q[3];
 if(!norm2)norm2=DBL_MIN;

 //p[i][j] is the normalised product of quaternion components i and j
 double p[4][4];
 for(int i=0;i<4;i++)
  for(int j=0;j<4;j++)
   p[i][j]=q[i]*q[j]/norm2;

 u[0][0]= p[0][0] + p[1][1] - p[2][2] - p[3][3];
 u[0][1]= 2.0 * (p[1][2] + p[0][3]);
 u[0][2]= 2.0 * (p[3][1] - p[0][2]);

 u[1][0]= 2.0 * (p[1][2] - p[0][3]);
 u[1][1]= p[0][0] - p[1][1] + p[2][2] - p[3][3];
 u[1][2]= 2.0 * (p[2][3] + p[0][1]);

 u[2][0]= 2.0 * (p[3][1] + p[0][2]);
 u[2][1]= 2.0 * (p[2][3] - p[0][1]);
 u[2][2]= p[0][0] - p[1][1] - p[2][2] + p[3][3];
}
//Scores a superposition and rebuilds the list of aligned atoms.
// nat       number of atoms in each structure
// d0        distance scale of the score
// d         starting distance cut-off; raised in steps of 0.5 until enough atoms fall below it
// R,t       superposition applied to coords1 before it is compared with coords2
// coords1   3*nat doubles
// coords2   3*nat doubles
// acoords   out: paired coordinates (6 doubles per atom) of the atoms within the cut-off
// ialign    in/out: indices of the aligned atoms (ialign points to atom number)
// nalign    in/out: number of entries in ialign
// tm_score  out: mean over all atoms of 1/(1+dist^2/d0^2)
//Returns 1 when the aligned set differs from the one passed in (acoords and *nalign are then
//updated), otherwise 0 with acoords and *nalign left untouched.
//NOTE(review): a new set that is a proper prefix of the old one also counts as "unchanged" -
//confirm this is intended.
int score_fun_dcoords(int nat, float d0, float d, double R[3][3], double t[3],double *coords1, double *coords2,double *acoords,int *ialign,int *nalign,float *tm_score)
{
 int k,ncut=0,nchange=0,my_nalign=*nalign,nreach=0,need;
 double d2,dist;
 double *my_dist=0,my_score=0;
 //nothing to score - avoid a zero sized allocation and a division by zero
 if(nat<1)
 {
  *tm_score=0;
  return(0);
 }
 if(!(my_dist=(double*)malloc(nat*sizeof(double))))exit(FALSE);
 //squared distance of every atom pair after superposition, and the score sum
 for(k=0;k<nat;k++)
 {
  double u[3];
  int m=3*k;
  u[0]=t[0]+R[0][0]*coords1[m]+R[1][0]*coords1[m+1]+R[2][0]*coords1[m+2]-coords2[m];
  u[1]=t[1]+R[0][1]*coords1[m]+R[1][1]*coords1[m+1]+R[2][1]*coords1[m+2]-coords2[m+1];
  u[2]=t[2]+R[0][2]*coords1[m]+R[1][2]*coords1[m+1]+R[2][2]*coords1[m+2]-coords2[m+2];
  dist=u[0]*u[0]+u[1]*u[1]+u[2]*u[2];
  my_score+=1.0/(1.0+dist/d0/d0);
  my_dist[k]=dist;
  if(dist<HUGE_VAL)nreach++; //false for NaN and infinity, which no cut-off can ever include
 }
 //adjust d until there are at least 3 the same - or every reachable atom when there are
 //fewer than 3 of them, otherwise the cut-off would grow forever
 need=(nreach<3)?nreach:3;
 do
 {
  d2=d*d;
  ncut=0;
  for(k=0;k<nat;k++)
   if(my_dist[k]<d2) ncut++;
  d+=.5;
 } while(ncut <need);
 //rebuild ialign from the atoms inside the final cut-off and note whether it differs from before
 ncut=0;
 for(k=0;k<nat;k++)
  if(my_dist[k]<d2)
  {
   if(ncut < my_nalign && ialign[ncut] == k)ncut++;
   else
   {
    nchange=1;
    ialign[ncut++]=k;
   }
  }
 if(my_dist)free(my_dist);
 *tm_score=my_score/(double)nat;
 if(!nchange)return(0);
 int m=0;
 for(k=0;k<ncut;k++)
 {
  int n=ialign[k];
  acoords[m++]=coords1[3*n];
  acoords[m++]=coords1[3*n+1];
  acoords[m++]=coords1[3*n+2];
  acoords[m++]=coords2[3*n];
  acoords[m++]=coords2[3*n+1];
  acoords[m++]=coords2[3*n+2];
 }
 *nalign=ncut;
 return(1);
}
