#!/usr/bin/env bash
#
# Normalise a plain-text corpus so it can be fed to a word-cloud generator.
#
# Usage: <script> [INFILE [OUTFILE [TMPFILE]]]
#
# Processing steps, in order:
#   1. Clean: hyphens and all whitespace (spaces, tabs, line breaks) become
#      single spaces, every character not matched by [ a-zA-Z'] is deleted
#      (this also removes commas, digits and punctuation), and letters are
#      lower-cased.
#   2. Rewrite word prefixes listed in REPLACE_WORDS (merges plural and
#      spelling variants).
#   3. Join the two-word terms listed in COMPOSITE_WORDS with a ".".
#   4. Drop every word listed in DO_NOT_CONSIDER.
#
# The result is a single line of space-separated words. It is written to
# TMPFILE and then copied to OUTFILE, so both files exist afterwards.
# Progress messages go to stdout, errors go to stderr.

# No `set -e` on purpose: every failure path is checked explicitly and
# main's return value becomes the exit status of the script.
set -u
set -o pipefail

# Default file names; each can be overridden by a positional argument.
INFILE=2019_03_31_WordcloudResearch.txt
OUTFILE=2019_03_31_WordcloudResearch_edited.txt
TMPFILE=tmp_x_file.txt

# Words to replace, as "from-to" pairs. "from" is matched as a word prefix
# (it must follow a space); the rules are applied in the listed order.
REPLACE_WORDS=(
  crystalline-crystal polariz-polaris geometric-geometry
  mesophases-mesophase tomographic-tomography nerk-network
  selfassemb-self.assemb structures-structure cells-cell gyroids-gyroid
  packings-packing surfaces-surface materials-material geometries-geometry
  material-materials phases-phase models-model poissons-poisson
  nanostructures-nanostructure anisotropic-anisotropy networks-network
  tessellations-tesselation patterns-pattern tessela-tessella images-image
  particles-particle solids-solid ellipsoids-ellipsoid
  morphologies-morphology algorithms-algorithm spaces-space
  copolymers-copolymer phases-phase photonics-photonic fluids-fluid
  theoretical-theory characteriz-characteris shapes-shape callophrys-c
  rubii-rubi designs-design disordered-disorder ordered-order pore-porous
  grains-granular grain-granular
)

# Composite words, as "first-second" pairs: every "first second" in the
# text becomes "first.second" so the term is counted as one word.
COMPOSITE_WORDS=(
  liquid-crystal monte-carlo molecular-dynam phase-diagram cubic-phase
  functional-mater circular-dichroism minkowski-tensor structure-metric
  minimal-surface medial-axis medial-surface circular-polaris
  integral-geomet poisson-ratio group-theor wing-scal self-assembl
  minkowski-function band-struct laser-writing effective-prop boolean-mod
  c-rubi triply-period
)

# Words that should not be considered for the word cloud: fill words,
# general words that do not relate to the specific field, addresses and
# the like. They were selected by looking at the resulting word cloud and
# picking the words that are not relevant.
DO_NOT_CONSIDER=(
  also simple system sind one two three four five six well system area
  south doi many rights show block present based global local length
  within found different properties obtained mean used result show based
  systems point data results find group given can set light using set
  experimental parameters provide potential findings defined writing
  single novel large modes state first function values size data size
  conventional new use ratios high use analysis demonstrate approach
  however average charcterization derived investigate properties direct
  close method methods process processes field fraction solution negative
  direction free particular conference meeting indices intercept diagram
  threshold functionals similar compared standard system systems
  processes change however response responses studies mecke study
  furthermore mode describe discuss science definition general including
  allows directly probably behavior behaviour example examples
  construction strong shows research sensitive ground problem fractions
  fraction explicit left right often strong paper quantify provides
  provide forms form element regions band number effective physical
  configurations characterization important case small diametre due
  useful unique double reveals reveal fully derive whose block limit
  tonight range yet deneric occurs each may likely specific dense reveals
  presence class may way rates better central several value lower
  differences the even studied second terms positive shown purpose
  constant along pair yield triply called vertical type derive future
  known called lack various recent lines thus among consider reported
  socalled comparison among with highly this the information concept
  conclusion best better worse beyond degree etc via rather society least
  confirm systematic control higher shyde hyde pansu owing
)

#######################################
# Reduce raw text to lower-case words separated by single spaces.
# Inputs:  raw text on stdin
# Outputs: cleaned text on stdout, as one line
#######################################
clean_text() {
  # Every hyphen becomes a space (note the `g`): otherwise a second hyphen
  # on the same line would be deleted by the character filter below and
  # fuse its two words together.
  sed -e 's/-/ /g' \
    | tr -s '[:space:]' ' ' \
    | sed -e "s/[^ a-zA-Z']//g" \
    | tr -s ' ' \
    | tr '[:upper:]' '[:lower:]'
  # The second `tr -s` is needed because deleting characters can leave two
  # spaces next to each other ("a (1) b" -> "a  b").
}

#######################################
# Remove every word that appears in DO_NOT_CONSIDER.
# Globals: DO_NOT_CONSIDER (read)
# Inputs:  space-separated words on stdin
# Outputs: remaining words on stdout, joined by single spaces
#######################################
drop_stop_words() {
  # Whole-word filtering (instead of substituting " word ") also removes a
  # stop word at the very start or end of the text and runs of the same
  # stop word ("the the").
  awk '
    NR == FNR { stop[$1] = 1; next }
    {
      kept = ""
      for (i = 1; i <= NF; i++) {
        if (!($i in stop)) {
          kept = (kept == "" ? $i : kept " " $i)
        }
      }
      print kept
    }
  ' <(printf '%s\n' "${DO_NOT_CONSIDER[@]}") -
}

#######################################
# Run the complete clean-up.
# Globals:   INFILE, OUTFILE, TMPFILE (read, defaults for the arguments)
#            REPLACE_WORDS, COMPOSITE_WORDS, DO_NOT_CONSIDER (read)
# Arguments: $1 - input file    (optional, default $INFILE)
#            $2 - output file   (optional, default $OUTFILE)
#            $3 - scratch file  (optional, default $TMPFILE)
# Outputs:   progress messages on stdout, errors on stderr
# Returns:   0 on success, non-zero if the input is unreadable or a
#            processing step fails
#######################################
main() {
  local infile=${1:-$INFILE}
  local outfile=${2:-$OUTFILE}
  local tmpfile=${3:-$TMPFILE}
  local rule head tail
  local -a rewrite=()

  if [[ ! -f "$infile" || ! -r "$infile" ]]; then
    printf 'error: cannot read input file: %s\n' "$infile" >&2
    return 1
  fi

  # The replacement patterns require a space in front of the word; pad the
  # line so the first word of the text can match as well. The padding is
  # dropped again when the stop-word filter re-joins the words.
  rewrite+=(-e 's/^/ /')

  # All rules are collected into one sed invocation. They are still
  # applied strictly in list order, which matters for chained rules such
  # as materials->material followed by material->materials.
  for rule in "${REPLACE_WORDS[@]}"; do
    head=${rule%%-*}
    tail=${rule##*-}
    printf 'Replace word :  %s by %s\n' "$head" "$tail"
    rewrite+=(-e "s/ ${head}/ ${tail}/g")
  done

  # NOTE(review): the composite pattern is not anchored at a word start,
  # so e.g. "c rubi" also matches inside "photonic rubi...". Kept as is.
  for rule in "${COMPOSITE_WORDS[@]}"; do
    head=${rule%%-*}
    tail=${rule##*-}
    printf 'Composite word :  %s . %s\n' "$head" "$tail"
    rewrite+=(-e "s/${head} ${tail}/${head}.${tail}/g")
  done

  # Removing words that are not relevant.
  printf '%s\n' "${DO_NOT_CONSIDER[@]}"

  if ! clean_text <"$infile" \
    | sed "${rewrite[@]}" \
    | drop_stop_words >"$tmpfile"; then
    printf 'error: processing of %s failed\n' "$infile" >&2
    return 1
  fi

  # The output file only ever receives a completely processed result.
  cp -- "$tmpfile" "$outfile"
}

# Must stay the last command: its status is the exit status of the script.
main "$@"