; $Id$
;###############################################################################
; LICENSE:
;  The software in this file is written by an employee of 
;  National Institute of Standards and Technology 
;  as part of the DAVE software project.
;
;  The DAVE software package is not subject to copyright protection
;  and is in the public domain. It should be considered as an
;  experimental neutron scattering data reduction, visualization, and
;  analysis system. As such, the authors assume no responsibility
;  whatsoever for its use, and make no guarantees, expressed or
;  implied, about its quality, reliability, or any other
;  characteristic. The use of certain trade names or commercial
;  products does not imply any endorsement of a particular product,
;  nor does it imply that the named product is necessarily the best
;  product for the stated purpose. We would appreciate acknowledgment
;  if the DAVE software is used of if the code in this file is
;  included in another product.
;
;###############################################################################
; Written by J.R.D. Copley.
;************************************************************************************************
pro parse_formula,formula=formula,natoms,isotope
;************************************************************************************************
; This routine breaks a simple chemical formula into its components. The formula contains no
; parentheses. Isotopes are represented using the '\' character, e.g. '\13C'.
; Examples of formulas:.
;	formula='\12C3 D8 Si2\16O6.5'
;	formula='C3NF3D8SiOs6.5'
;
compile_opt strictarr
;
;
; Remove all spaces.
formula=strcompress(formula,/remove_all)
;
; Separate out the individual components looking for u.c. letters and the '\' symbol.
where_uc,formula,start
nbits=n_elements(start)
lgths=strlen(formula)
if (nbits gt 1) then lgths=[start[1:nbits-1],lgths]-start
;
;	These are the components of the formula.
bits=strmid(formula,start,lgths)
;
natoms=fltarr(nbits)
isotope=strarr(nbits)
;
; For each component store the number of atoms and the isotope symbol.
for k=0,nbits-1 do begin
	bit=bits[k]
	start=stregex(bit,'[A-Z]+[a-z]*',length=lgth)
	massno=strmid(bit,0,start)
	;
	; If nothing precedes the element symbol prepend 'nat-'
	if (massno eq '') then massno='nat-'
	;
	; If the component is an isotope remove the '\' symbol.
	if (strmid(massno,0,1) eq '\') then massno=strmid(massno,1)
	isotope[k]=massno+strmid(bit,start,lgth)
	;
	; Treat special cases of D and T.
	if (isotope[k] eq 'nat-D') then isotope[k]='2H'
	if (isotope[k] eq 'nat-T') then isotope[k]='3H'
	natoms[k]=float(strmid(bit,start+lgth))
	;
	; Check for subscript 0 following chemical symbol (not permitted).
	if (natoms[k] le 0.0 and (strlen(bit) gt start+lgth)) then natoms[k]=-1
	;
	; If nothing follows the atom symbol assume 1 atom.
	if (natoms[k] eq 0.0) then natoms[k]=1.0
endfor
;
end


;************************************************************************************************
pro where_uc,formula,pos
;************************************************************************************************
; This routine finds the positions of u.c. and '\' characters in a string.
; For each '\' found, the position of the following u.c. character is removed from the list.
compile_opt strictarr
;
lgth=strlen(formula)
pos=indgen(lgth)
char=strmid(formula,pos,1)
;
; Find positions of u.c. and '\' characters.
match=strmatch(char,'[\\A-Z]')
pos=where(match eq 1,npos)
;
; Find positions of '\' characters.
match=strmatch(char,'\\')
pos2=where(match eq 1,nbslash)
;
; If no '\' characters found, return.
if (nbslash eq 0) then return
;
; This code removes the position of the u.c. character following each '\' character.
for k=0,nbslash-1 do begin
	loc=where(pos eq pos2[k])
	if (loc+2 gt npos-1) then begin
		pos=pos[0:loc]
	endif else begin
		pos=[pos[0:loc],pos[loc+2:npos-1]]
	endelse
endfor
end


;************************************************************************************************
function expand_formula,substr
;************************************************************************************************
; This routine takes a substring of the form '(...)' or '(...)x where ... is a chemical
; formula with no parentheses and x is a number, and multiplies it out.
; e.g. (NO3)2 becomes N2.000O6.000, (CH3) becomes CH3.
compile_opt strictarr
;
pos=strpos(substr,')')
lgth=strlen(substr)
;
; If nothing follows the ')', return with parentheses stripped out.
if (pos+1 eq lgth) then return,strmid(substr,1,lgth-2)
;
; Extract the string between parentheses and the multiplier.
formula=strmid(substr,1,pos-1)
multiplier=float(strmid(substr,pos+1))
;
; Break the formula into its components
where_uc,formula,start
nbits=n_elements(start)
lgths=strlen(formula)
if (nbits gt 1) then lgths=[start[1:nbits-1],lgths]-start
bits=strmid(formula,start,lgths)
nbits=n_elements(bits)
;
; Construct the new formula.
formula=''
for k=0,nbits-1 do begin
	bit=bits[k]
	;
	;	Separate out the isotope and the number of atoms.
	start=stregex(bit,'[\\0-9]*[A-Z]+[a-z]*',length=lgth)
	isotope=strmid(bit,start,lgth)
	natoms=float(strmid(bit,start+lgth))
	;
	; If nothing follows the atom symbol assume 1 atom.
	if (natoms eq '') then natoms=float(1)
	natoms=natoms*multiplier
	formula=formula+isotope+strcompress(string(natoms,format='(f10.3)'),/remove_all)
endfor
;
return,formula
;
end


;************************************************************************************************
pro simplify_formula,formula,natoms,isotope
;************************************************************************************************
; The purpose of this code is to take a chemical formula and return the element or isotope names
; plus the numbers of atoms of each element or isotope. There is no attempt to consolidate.
;	Thus a formula such as 'CH3COOH' returns isotope as [' nat-C',' nat-H',' nat-C',' nat-O',
;	' nat-O',' nat-H'] and natoms as [1.00000,3.00000,1.00000,1.00000,1.00000,1.00000].
; The input notation for an element is e.g. 'C' or 'Si' (case is important). For an isotope it
; is e.g. '\13C'. The elements are returned as e.g. ' nat-C' and ' nat-Si'. The isotopes are
; returned as e.g. ' 13C'. 'D' and 'T' are returned as '2H' and '3H' respectively.
;
compile_opt strictarr
;
; First we look for portions of the formula enclosed in parentheses and replace them with
; equivalent strings, e.g. (NO3)2 is returned as 'N2.00000O6.00000'.
; The process is repeated until there are no remaining parentheses.
;
; We look for a '(' followed by alphanumerics (including '.' and '\'), and then a ')' followed
; by nothing or by a number.
pattern='\({1}[A-Za-z0-9.\\]*\){1}[0-9.]*'
;
repeat begin
	pos=stregex(formula,pattern,length=l)
	firstpart=strmid(formula,0,pos)
	middlepart=strmid(formula,pos,l)
	finalpart=strmid(formula,pos+l)
	done=(middlepart eq '')
	if (not done) then middlepart=expand_formula(middlepart)
	formula=firstpart+middlepart+finalpart
endrep until (done)
;
; Having got rid of all the parentheses we break the formula into small pieces.
parse_formula,formula=formula,natoms,isotope
;
end
