Commit f7a844d2 authored by Matthias Schroeder's avatar Matthias Schroeder

lookup_formulas.c added

parent 43c4d801
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <getopt.h>
#include <math.h>
#include <ctype.h>
#include <sys/stat.h>
#include <unistd.h>
#include <sys/types.h>
#define limC 3000
#define limH 30000
#define limO 400
#define limN 200
#define limS 120
#define limP 100
#define limCl 10
// Check if product of all these numbers fit in uint64
#define MAX_MASS 2000
#define MAX_RESULTS 500000
#define MAX_ELE 32
#define MAX_SEARCH 5000000
#define MAX_SPECS 2 // how many special elements allowed
#define TOLMUL 1.0E7
#define TOLDIV 10000000LL
int accepted = 0;
int skipped = 0;
int rejected[] = {0,0,0};
int tried = 0;
// List of weights to search
int64_t searchweights[MAX_SEARCH];
// Used to store results
typedef struct {
uint64_t orgsearch;
uint64_t diff;
int a[MAX_ELE];
int idx;
} Result;
Result results[MAX_RESULTS];
int last_result = 0;
// Dataformat in binary files
typedef struct {
uint64_t mass;
uint64_t formel;
} Moli;
// used to store binary file in memory
typedef struct {
int64_t last; // list length
Moli* a; // list of moli tuples
} Moli1;
Moli1* mols[MAX_MASS]; // array of lists to store mass and formula tuples
// Base definitions of elements
typedef struct {
char formel[10];
uint64_t mass;
int valence;
// int defining group
// 2: Isotope replacing other C13 <-> C
// 1: Metal - used with -m or max 2 metals
int spec0;
} Element;
// struct for elements describing loop and other props
typedef struct {
char formel[10]; // Just String for formula, C13
int cFrom; // Min number of elements in resuilt
int cTo; // Max Number in result
char* ref; // Refered Element for replacements (c13 <-> C)
int iref; // index of refered Element (calculated)
int64_t delta; // mass difference if added or replaced, so mass for added elements, C13-C for replacements
Element e; // reference to element
} Count;
Element elements[] = {
{ "C" , 12000000000 , 4, 0 }, // 0
{ "H" , 1007825000 , 1, 0 }, // 1
{ "O" , 15994914619 , 2, 0 }, // 2
{ "N" , 14003074004 , 5, 0 }, // 3
{ "S" , 31972072000 , 6, 0 }, // 4
{ "P" , 30973763000 , 5, 0 }, // 5
{ "Cl" , 34968852700 , 1, 1 }, // 6
{ "Na" , 22989769280 , 1, 1 }, // 7
{ "C13" , 13003354835 , 4, 2 }, // 8
{ "O18" , 17999161000 , 2, 2 }, // 9
{ "N15" , 15000108899 , 5, 2 }, // 10
{ "S34" , 33967866900 , 6, 2 }, // 11
{ "Cl37" , 36965902590 , 1, 1 }, // 11
{ "Co" , 58933195000 , 2, 1 }, // 12
{ "Cu.i" , 62929597720 , 1, 1 }, // 13
{ "Cu.ii" , 62929597720 , 2, 1 }, // 14
{ "Cu65.i" , 64927789500 , 1, 1 }, // 15
{ "Cu65.ii", 64927789500 , 2, 1 }, // 16
{ "Fe.ii" , 55934936000 , 2, 1 }, // 17
{ "Fe.iii", 55934936000 , 3, 1 }, // 18
{ "Fe54.ii" , 53939609000 , 2, 1 }, // 19
{ "Fe54.iii", 53939609000 , 3, 1 }, // 20
{ "Br79" , 78918338000 , 1, 1 }, // 21
{ "Br81" , 80916900000 , 1, 1 }, // 22
{ "Ni.ii" , 57935346200 , 2, 1 }, // 23
{ "Ni60.ii" , 59930786400 , 2, 1 }, // 24
{ "Zn.ii" , 65380000000 , 2, 1 }, // 25
{ "Zn66.ii" , 65926033400 , 2, 1 }, // 26
{ "I" , 126904470000 , 1, 1 } }; // 27
int NBR_ELEMENTS= sizeof(elements)/sizeof(Element);
#define COUNTS_OFFSET 7 // index of first not pregenerated element
Count molstd_counts[] = {
{ "C" , 0, 80, NULL },
{ "H" , 0, 400, NULL },
{ "O" , 0, 40, NULL },
{ "N" , 0, 30, NULL },
{ "S" , 0, 15, NULL },
{ "P" , 0, 20, NULL },
{ "Cl" , 0, 2, NULL },
};
int NBRSTD_COUNTS= sizeof(molstd_counts)/sizeof(Count);
// All the counts with min and max for the additinal elements, missing cols added in init_alternatives
Count mol_counts[] = {
{ "Na" , 0, 2, NULL },
{ "C13" , 0, 3, "C" },
{ "O18" , 0, 1, "O" },
{ "N15" , 0, 1, "N" },
{ "S34" , 0, 1, "S" },
{ "Cl37" , 0, 1, "Cl" },
{ "Co" , 0, 2, NULL },
{ "Cu.i" , 0, 2, NULL },
{ "Cu.ii" , 0, 2, NULL },
{ "Cu65.i" ,0, 2, "Cu.i" },
{ "Cu65.ii" ,0, 2, "Cu.ii" },
{ "Fe.ii" , 0, 2, NULL },
{ "Fe.iii" , 0, 2, NULL },
{ "Fe54.ii" ,0, 2, "Fe.ii" },
{ "Fe54.iii" ,0, 2, "Fe.iii" },
{ "Br79" , 0, 2, NULL },
{ "Br81" , 0, 2, NULL },
{ "Ni.ii" , 0, 2, NULL },
{ "Ni60.ii" ,0, 2, "Ni.ii" },
{ "Zn.ii" , 0, 2, NULL },
{ "Zn66.ii" ,0, 2, "Zn.ii" },
{ "I" , 0, 2, NULL },
};
int NBR_COUNTS= sizeof(mol_counts)/sizeof(Count);
int cur_mol[MAX_ELE - COUNTS_OFFSET]; // array with current element numbers
uint64_t cur_mol_nbr = 0;
int DO_DEBUG = 0;
// -------------------------------------------------------------------------
void print_table_results();
// -------------------------------------------------------------------------
//
char* aF2csvlist(char* f, int* aF) {
f[0]= '\0';
for (int i=0; i<NBR_ELEMENTS; i++) {
sprintf(f+strlen(f), "%d\t", aF[i]);
}
f[strlen(f)-1]='\0';
return f;
}
char* aF2formel(char* f, int* aF) {
f[0]= '\0';
for (int i=0; i<NBR_ELEMENTS; i++) {
if (aF[i]) {
sprintf(f+strlen(f), "%s_%d ", elements[i].formel, aF[i]);
}
}
f[strlen(f)-1]='\0';
return f;
}
void mass2str(char* s, uint64_t m) {
char r[256];
sprintf(r, "%010ld", m);
char* nk= r+(strlen(r)-9);
char* p=r;
char* q=s;
while (*p) {
*q=*p;
p++;
q++;
if (p==nk) *q++= '.';
}
*q= '\0';
}
// converts BigintWeight o rounded normal weight
int weight2int(int64_t weight) {
int64_t fi = weight/1000000000LL;
return (int)fi;
}
// Converts Database Int to Formula (int[])
void int2aF(int* aF, uint64_t iF) {
int i;
for (i=0; i < MAX_ELE; i++) {
aF[i]= 0;
}
i=0;
aF[i++]= iF % limC;
iF = iF / limC;
aF[i++]= iF % limH;
iF = iF / limH;
aF[i++]= iF % limO;
iF = iF / limO;
aF[i++]= iF % limN;
iF = iF / limN;
aF[i++]= iF % limS;
iF = iF / limS;
aF[i++]= iF % limP;
iF = iF / limP;
aF[i++]= iF % limCl;
iF = iF / limCl;
}
uint64_t mass_aF(int* aF) {
uint64_t m= 0;
for (int i = 0; i < NBR_ELEMENTS; ++i) {
if (aF[i]>0) {
m = m + aF[i]*elements[i].mass;
}
}
return m;
}
int val_aF(int* aF) {
int v= 0;
for (int i = 0; i < NBR_ELEMENTS; ++i) {
if (aF[i]>0) {
v = v + aF[i]*elements[i].valence;
}
}
return v;
}
double calc_hc(int* aF) {
return (double)aF[1] / (double)(aF[0]+aF[8]);
}
double calc_oc(int* aF) {
return (double)aF[2] / (double)(aF[0]+aF[8]);
}
int check_mol(int* aF) {
for (int i=0; i<NBR_ELEMENTS; i++) {
if (aF[i]<0) {
rejected[0] += 1;
return 0;
}
}
// check valence
if (val_aF(aF) % 2 == 1) {
rejected[1] += 1;
return 0;
}
// check number of special elements
int sum= 0;
for (int i=COUNTS_OFFSET; i<NBR_ELEMENTS; i++) {
if (elements[i].spec0==1)
sum+= aF[i];
}
if (sum>MAX_SPECS) {
rejected[2] += 1;
return 0;
}
accepted += 1;
return 1;
}
Moli1* read_binfile(char* infilename) {
FILE* fp;
int64_t vers= 0x000000000001L;
Moli1* m;
uint64_t cnt;
m= (Moli1*)malloc(sizeof(Moli1));
if( (fp=fopen(infilename,"rb")) == NULL) {
m->last= 0;
m->a= NULL;
return m;
}
cnt = fread(&(m->last), sizeof(int64_t), 1, fp);
if (cnt!=1) exit(2);
cnt = fread(&vers, sizeof(int64_t), 1, fp);
if (cnt!=1) exit(2);
m->a= (Moli*)malloc(sizeof(Moli1) * m->last);
cnt = fread(m->a, sizeof(Moli), m->last, fp);
if (cnt!=m->last) {
fprintf(stderr, "File %s, Expected %ld lines, found %ld\n",
infilename, m->last, cnt );
exit(2);
}
if (DO_DEBUG) fprintf(stderr, " Read %s, nbr of entries:%ld\n", infilename, m->last);
fclose(fp);
return m;
}
Moli1* read_filesfor(int idx, int64_t w) {
char fn[256];
int iweight;
int cnt_cache = 0;
iweight = weight2int(w);
// if (DO_DEBUG) fprintf(stderr, "read files for %ld (%d)\n", w, iweight);
if (iweight<0 || iweight>MAX_MASS) {
fprintf(stderr, "Weight %ld out of range for %d\n", w, idx);
exit(9);
}
if (mols[iweight] && mols[iweight]->last) {
return mols[iweight];
}
// Count already read lists
cnt_cache = 0;
for (int i=0; i<MAX_MASS; i++) {
if (mols[i] && mols[i]->last) {
cnt_cache += 1;
}
}
if (cnt_cache>200 && iweight>5 && iweight<MAX_MASS) {
for (int i=0; (i<iweight-5) && (cnt_cache>190); i++) {
if (mols[i] && mols[i]->last) {
if (DO_DEBUG) fprintf(stderr, " Free %4d %ld\n", i, mols[i]->last);
free(mols[i]->a);
free(mols[i]);
mols[i]= NULL;
cnt_cache -= 1;
}
}
}
if (iweight>0 && iweight<MAX_MASS) {
sprintf(fn, "weights/w%04d.bin", iweight);
if (!mols[iweight]) {
mols[iweight]= read_binfile(fn);
}
}
return mols[iweight];
}
int push_result(int idx, uint64_t searchmass, uint64_t mass, int* aF) {
if (last_result>=MAX_RESULTS) {
print_table_results();
}
results[last_result].idx = idx;
results[last_result].orgsearch = searchmass;
if (mass>searchmass)
results[last_result].diff = mass-searchmass;
else
results[last_result].diff = searchmass-mass;
if (results[last_result].diff > 10000000) {
fprintf(stderr, "Diff zu gross %ld (%ld, %ld)\n", results[last_result].diff, mass, searchmass);
exit(9);
}
for (int i=0; i<NBR_ELEMENTS; i++)
results[last_result].a[i]= aF[i];
last_result+=1;
char formel[256];
}
// returns index of first element in list m with mass >= target
uint64_t search_start_in_list(uint64_t target, Moli1* m, uint64_t first, uint64_t last) {
int test;
if (first+1 >= last) return first;
if (target > m->a[first].mass) {
test = (first+last) / 2;
if (target >= m->a[test].mass) {
return search_start_in_list(target, m, test, last);
} else {
return search_start_in_list(target, m, first, test);
}
} else {
if (first==0 || target < m->a[first-1].mass) return first;
}
}
void search_weight_db(int idx,
uint64_t orgsearch, uint64_t searchweight,
uint64_t tolerance) {
int aF[MAX_ELE];
int valid;
uint64_t i_mass;
uint64_t tol = orgsearch * tolerance / TOLDIV;
Moli1* m = read_filesfor(idx, searchweight);
if (!m) {
fprintf(stderr, "No list found for %ld (%d)\n", searchweight, weight2int(searchweight));
return;
}
// search for first element larger org-tol
uint64_t target = searchweight - tol;
uint64_t start = search_start_in_list(target, m, 0, m->last);
// printf("search %ld - %ld\n", searchweight - tol, searchweight + tol);
// run
for (uint64_t i=start; i< m->last && m->a[i].mass < searchweight+tol; i++) {
if (labs(m->a[i].mass - searchweight) < tol) {
tried += 1;
int2aF(aF, m->a[i].formel);
// Check mol with current limits
valid = 1;
for (int j=0; j<NBRSTD_COUNTS && valid; j++) {
if ((aF[j]<molstd_counts[j].cFrom) || (aF[j]>molstd_counts[j].cTo)) {
// printf("skipped at j=%d, aF=%d\n", j, aF[j]);
valid = 0;
skipped += 1;
// activate only for detailed debugs, lots of output!!!!
if (0 && DO_DEBUG) {
char fsum[256];
aF2formel(fsum, aF);
fprintf(stderr, "skip(at %s(j=%d), #=%d[%d,%d]) %s\n",
molstd_counts[j].formel, j, aF[j], molstd_counts[j].cFrom,molstd_counts[j].cTo, fsum);
}
}
}
if (valid) {
for (int j=0; j<NBR_COUNTS; j++) {
if (cur_mol[j] && mol_counts[j].ref) {
aF[mol_counts[j].iref] -= cur_mol[j];
}
aF[j+COUNTS_OFFSET]= cur_mol[j];
}
if (check_mol(aF)) {
i_mass= mass_aF(aF);
if (labs(i_mass-orgsearch)> 11000000) {
fprintf(stderr, "Diff zwischen %ld %ld (sw=%ld, d=%ld (%lld))\n", i_mass, orgsearch, searchweight, i_mass-orgsearch, searchweight/1000000000LL);
} else {
push_result(idx, orgsearch, i_mass, aF);
}
}
}
}
}
}
char* specs2str(char* s) {
s[0] = '\0';
for (int i = 0; i < NBR_COUNTS; ++i) {
if (cur_mol[i]>0)
sprintf(s+strlen(s), "%s_%d ", mol_counts[i].e.formel, cur_mol[i]);
}
s[strlen(s)-1]='\0';
return s;
}
int count_specs() {
int nbr = 0;
for (int i = 0; i < NBR_COUNTS; ++i) {
if (mol_counts[i].e.spec0 == 1) {
nbr+= cur_mol[i];
}
}
return nbr;
}
uint64_t mass_specs() {
uint64_t m = 0;
for (int i = 0; i < NBR_COUNTS; ++i) {
m += cur_mol[i] * mol_counts[i].delta;
}
return m;
}
int next_specs() {
int nbr = count_specs();
for (int i = 0; i < NBR_COUNTS; ++i) {
if (cur_mol[i] < mol_counts[i].cTo &&
(nbr < MAX_SPECS ||
(nbr==MAX_SPECS && mol_counts[i].e.spec0 != 1))) {
cur_mol[i] += 1;
cur_mol_nbr+= 1;
return 1;
} else {
cur_mol[i] = mol_counts[i].cFrom;
nbr = count_specs();
}
}
return 0;
}
void init_specs() {
for (int i = 0; i < NBR_COUNTS; ++i) {
cur_mol[i]= mol_counts[i].cFrom;
}
cur_mol_nbr = 0;
}
void search_weight(int idx,
uint64_t searchweight,
uint64_t tolerance) {
char s[256];
uint64_t spec_mass;
init_specs();
search_weight_db(idx, searchweight, searchweight, tolerance);
while (next_specs()) {
spec_mass = mass_specs();
// Minimale mass to look for: 12
if (spec_mass+elements[0].mass<searchweight) {
specs2str(s);
// printf("try %s\n", s);
search_weight_db(idx, searchweight, searchweight-spec_mass, tolerance);
}
}
}
void print_table_header() {
char molsnames[256];
molsnames[0]=0;
for (int i=0; i<NBR_ELEMENTS; i++)
sprintf(molsnames+strlen(molsnames),"%s\t", elements[i].formel);
*(molsnames+strlen(molsnames)-1)= '\0';
printf("ID\tvalueData\tvalueMaster\tdiff \tH.C \tO.C \t%s\tformula\n",
molsnames);
}
void print_table_results() {
uint64_t i_mass;
uint64_t last_s= 0;
int id= 1;
// last_result= 0; return;
char formel[256], slist[256], mass[128], search[128], str[256];
// if (DO_DEBUG) fprintf(stderr, "Start print_results %d\n", last_result);
// we do nit give idx of hits instead idx of search list
// qsort(results, last_result, sizeof(Result), compareResults);
// if (DO_DEBUG) fprintf(stderr, " sorted\n");
for (int i=0; i<last_result; i++) {
i_mass = mass_aF(results[i].a);
mass2str(search, results[i].orgsearch);
mass2str(mass, i_mass);
aF2formel(formel, results[i].a);
aF2csvlist(slist, results[i].a);
sprintf (str,"%d\t%s\t%s\t%f\t%f\t%f\t%s\t%s",
results[i].idx,
search, mass,
(double)labs(i_mass-results[i].orgsearch)/(double)results[i].orgsearch*TOLMUL, //DIFF
calc_hc(results[i].a), calc_oc(results[i].a),
slist, formel);
puts(str);
}
last_result= 0;
}
void init_all() {
for (int i=0; i<MAX_MASS; i++) {
mols[i] = NULL;
}
// initialize start mol and copy elements
for (int i = 0; i < NBR_COUNTS; ++i) {
cur_mol[i]= mol_counts[i].cFrom; // init loop
for (int j = 0; j < NBR_ELEMENTS; ++j) {
// Fill missing values in mol_counts
if (strcmp(mol_counts[i].formel, elements[j].formel) == 0) {
memcpy(&(mol_counts[i].e), &(elements[j]), sizeof(Element));
mol_counts[i].e= elements[j];
mol_counts[i].delta= mol_counts[i].e.mass;
break;
}
}
if (mol_counts[i].ref) {
// Still fill missing values in mol_counts, in case of ref modify delta and enter index of refed element
for (int j = 0; j < NBR_ELEMENTS; ++j) {
if (strcmp(mol_counts[i].ref, elements[j].formel) == 0) {
mol_counts[i].delta= mol_counts[i].e.mass - elements[j].mass;
mol_counts[i].iref = j;
}
}
}
}
}
void metal_counts_to_zero() {
for (int i=0; i<NBR_COUNTS; i++) {
if (mol_counts[i].e.spec0 == 1) {
mol_counts[i].cTo=0;
if (DO_DEBUG) {
fprintf(stderr, "set %d %s to zero\n", i, mol_counts[i].formel);
}
}
}
}
int read_searchweights(char* fname) {
int idx=0;
double sv;
FILE* infh = fopen(fname, "r");
if (!infh) {
printf("File %s nicht zu oeffnen", fname);
exit(1);
}
while (fscanf(infh, "%lf", &sv)==1) {
if (sv>0 && sv<MAX_MASS) {
searchweights[idx++]= (uint64_t)(sv * 1E9);
} else {
fprintf(stderr, "Ungueltige Masse Zeile %d", idx);
exit(2);
}
if (idx>=MAX_SEARCH) {
fprintf(stderr, "Too many Masses %d", idx);
exit(2);
}
}
searchweights[idx]=0;
if (DO_DEBUG) {
fprintf(stderr, "READ %d SEARCHES\n", idx);
}
return idx;
}