Changeset 3481
- Timestamp:
- 09/08/10 10:20:57 (17 months ago)
- Location:
- biobank_search/handwritten/java/plugins
- Files:
-
- 5 edited
-
Annotator.java (modified) (1 diff)
-
DBIndexPlugin.java (modified) (3 diffs)
-
OntoCatIndexPlugin2.java (modified) (9 diffs)
-
OntocatQueryExpansion_lucene.java (modified) (14 diffs)
-
OntologyIndexerAndSearcher.java (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
-
biobank_search/handwritten/java/plugins/Annotator.java
r3413 r3481 11 11 import uk.ac.ebi.ontocat.OntologyTerm; 12 12 13 /* 13 /** 14 14 * Annotates the input text: searches words and phrases in ontologies and adds XML tags to found terms 15 15 */ -
biobank_search/handwritten/java/plugins/DBIndexPlugin.java
r3425 r3481 60 60 61 61 62 static final String LUCENE_INDEX_DIRECTORY = "/Users/ jorislops/Documents/workspace/Dasha/molgenis_db_index";62 static final String LUCENE_INDEX_DIRECTORY = "/Users/despoina/Documents/biobank_search_index/molgenis_db_index"; 63 63 static final String PROJECTNAME = "biobank_search"; //TODO : This should be retrieved, not hardcoded. 64 64 List<String> OntologiesForExpansion = null; … … 109 109 if (ontologies.isEmpty()){ 110 110 System.out.println("[Ontologies] is empty"); 111 this.setStatus("<h4> choose the ontologies to use for query expansion</h4>");111 this.setStatus("<h4>Choose the ontologies to use for query expansion</h4>"); 112 112 } 113 113 114 114 setOntologiesForExpansion(ontologies); 115 System.out.println(" ontologies : " + ontologies);115 System.out.println("Ontologies : " + ontologies); 116 116 117 117 … … 345 345 346 346 347 /** 348 * The function for query expansion. 349 * Creates a new (empty) instance of OntocatQueryExpansion_lucene class 350 * @param db 351 */ 347 352 public void ExpandQuery(Database db){ 348 353 -
biobank_search/handwritten/java/plugins/OntoCatIndexPlugin2.java
r3425 r3481 41 41 42 42 43 /* 44 * Indexes all ontologies specified in ontologyNamesMap. The ontologies should be downloaded on the computer in ONTOLOGIES_DIRECTORY 43 /** 44 * Indexes all ontologies specified in ontologyNamesMap. 45 * The ontologies should be downloaded on the computer in ONTOLOGIES_DIRECTORY 45 46 * Searches through the index 46 47 * … … 55 56 private String InputToken = "lung disease"; 56 57 // 57 static final String LUCENE_ONTOINDEX_DIRECTORY = "/Users/jorislops/Documents/workspace/Dasha/biobank_search"; 58 static final String ONTOLOGIES_DIRECTORY = "/Users/jorislops/Documents/workspace/Dasha/biobank_search/ontologies/"; 58 static final String LUCENE_ONTOINDEX_DIRECTORY = "/Users/despoina/Documents/biobank_search_index/"; 59 static final String ONTOLOGIES_DIRECTORY = "/Users/despoina/Documents/workspace/biobank_search/ontologies"; 60 61 59 62 60 63 public static final Map<String , String> ontologyNamesMap = new HashMap<String, String>() {{ … … 170 173 collector = TopScoreDocCollector.create(1000, true); 171 174 172 //making a boolean query to specify in which ontologies to search 175 /** 176 * making a boolean query to specify in which ontologies to search 177 */ 173 178 BooleanQuery labelQuery = new BooleanQuery(); 174 179 BooleanQuery finalQuery = new BooleanQuery(); … … 179 184 } 180 185 181 //the query to search the term in the field "term" 186 /** 187 * the query to search the term in the field "term" 188 */ 182 189 query2 = new TermQuery(new Term("term", query)); 183 190 184 //merging 2 queries together 191 /** 192 * merging 2 queries together 193 */ 185 194 finalQuery.add(query2, BooleanClause.Occur.MUST); 186 195 finalQuery.add(labelQuery, BooleanClause.Occur.MUST); … … 246 255 try { 247 256 System.out.println("Start Indexing Ontocat results") ; 248 this.setStatus("Starting indexing Ontocat results in " + 249 LUCENE_ONTOINDEX_DIRECTORY); 257 this.setStatus("Starting indexing Ontocat results in " + LUCENE_ONTOINDEX_DIRECTORY); 250 258 251 259 file = new File(LUCENE_ONTOINDEX_DIRECTORY); 252 260 analyzer = new StandardAnalyzer(Version.LUCENE_30); 253 writer = new IndexWriter(FSDirectory.open(file), analyzer, true, 254 IndexWriter.MaxFieldLength.UNLIMITED); 261 writer = new IndexWriter(FSDirectory.open(file), analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED); 255 262 256 for (String ontology_file: ontologyNamesMap.keySet()){263 for (String ontology_file: ontologyNamesMap.keySet()){ 257 264 say("now " + writer.getReader().numDocs() + " terms indexed"); 258 265 … … 267 274 for(OntologyTerm term: all_terms) 268 275 { 269 //getting the term with ontology label inside the index 270 //for each term we use a separate Document 276 /** 277 * getting the term with ontology label inside the index 278 * for each term we use a separate Document 279 */ 280 271 281 Document document = new Document(); 272 282 Field termField = new Field("term", term.getLabel().toLowerCase(), Field.Store.YES, Field.Index.NOT_ANALYZED); … … 277 287 document.add(ontologyLabelField); 278 288 279 //searching for synonyms and children in ontology, writing them to "expansion" with delimiters ";" 289 /** 290 * searching for synonyms and children in ontology, writing them to "expansion" with delimiters ";" 291 */ 280 292 List<OntologyTerm> children = new ArrayList<OntologyTerm>(); 281 293 List<String> syns = new ArrayList <String>(); … … 285 297 //System.out.println("syns:\n" + syns); 286 298 for (String s : syns){ 287 if (term.getLabel().toLowerCase() != s){ 299 if (term.getLabel().toLowerCase() != s){ //if it doesn't already exists 288 300 s = "\"" + s.toLowerCase() + "\""; 289 301 //System.out.println("syns: " + s); … … 309 321 310 322 } 311 //optimize the index 323 /** 324 * optimize the index 325 */ 312 326 System.out.println(": Optimizing Index :" ); 313 327 this.setStatus("Optimizing Ontocat Index" ); -
biobank_search/handwritten/java/plugins/OntocatQueryExpansion_lucene.java
r3413 r3481 19 19 import uk.ac.ebi.ontocat.virtual.CompositeDecorator; 20 20 21 /* 21 /** 22 22 * Expands the query by adding synonyms and children to initial query, using Boolean OR (not necessary, but more convenient to look through the query), expansion terms are weighted less than initial query terms 23 23 * @param query_terms - chunks: all possible combinations of query terms(subsequent) … … 58 58 } 59 59 60 //chunking the query into strings of different length 60 /** 61 * Chunking the query into strings of different length 62 */ 61 63 public List<String> chunk (List<String> words){ 62 64 String q = ""; … … 79 81 } 80 82 81 // generating a list of possible word combinations of different length from 82 // the query and setting it as query_terms 83 /** 84 * generating a list of possible word combinations of different length from 85 * the query and setting it as query_terms 86 */ 83 87 public List<String> parseQuery(String query) 84 88 { … … 90 94 91 95 92 String ignore = "[,.\\:\\!\\?;]"; //punctuation to be ignored93 query = query.replaceAll(ignore, " ").trim(); / /remove punctuation94 95 / /replacing special symbols96 String ignore = "[,.\\:\\!\\?;]"; /** punctuation to be ignored*/ 97 query = query.replaceAll(ignore, " ").trim(); /** remove punctuation */ 98 99 /** replacing special symbols */ 96 100 query = query.replaceAll("( *OR)|(OR *)", "|"); 97 101 query = query.replaceAll(" *AND *", "&"); 98 102 query = query.toLowerCase(); 99 103 int len = query.length(); 100 // splitting by ' ' and by spec_symbols, leaving phrases in "" as single unit phrases104 //* splitting by ' ' and by spec_symbols, leaving phrases in "" as single unit phrases */ 101 105 while(i < len){ 102 106 … … 154 158 155 159 } 156 //setting the initial query list (with Boolean operators) 160 161 /** 162 * setting the initial query list (with Boolean operators) 163 */ 157 164 setInit_query(words); 158 165 … … 160 167 List<String> tmp = new ArrayList<String>(); 161 168 162 //chunking the query into all possible n-grams, skipping the Boolean operators 169 /** 170 * chunking the query into all possible n-grams, skipping the Boolean operators 171 */ 163 172 int size = words.size(); 164 173 for (int x = 0; x < size; x++){ … … 178 187 } 179 188 180 //convert String[] to List<String>, removing repeated elements 189 /** 190 * convert String[] to List<String>, removing repeated elements 191 * @param arr 192 * @return 193 */ 181 194 public List<String> array2listNotDuplicate (String[] arr){ 182 195 List<String> list = new ArrayList<String>(); … … 189 202 } 190 203 191 //query expansion. Changes init_query by adding expansion terms after found terms and joining found phrase terms 204 /** 205 * Query expansion. Changes init_query by adding expansion terms after found terms and joining found phrase terms 206 * hrase - current part of the query 207 * found_terms - terms found in ontologies 208 * searcher.SearchIndexOntocat(String phrase, List<Strings> ontologies) searches the phrase in index files of ontologies, returns (term:syn1;syn2;...;child1;child2;...) 209 * expansion - synonyms + children 210 * found all - found_terms + expansion 211 * @param ontologiesToUse 212 */ 192 213 public void expand(List<String> ontologiesToUse) { 193 214 /* … … 209 230 OntoCatIndexPlugin2 searcher = new OntoCatIndexPlugin2("x",null); 210 231 211 //searching the phrase in ontologies 232 /** 233 * searching the phrase in ontologies 234 */ 212 235 List<String> found_terms = new ArrayList<String>(); 213 236 for( String str : searcher.SearchIndexOntocat(phrase, ontologiesToUse).split(":")) … … 223 246 List<String> expansion = new ArrayList<String>(); 224 247 225 / /adding the phrase and expansion terms248 /** adding the phrase and expansion terms */ 226 249 found_all.add(phrase.toLowerCase()); 227 250 if (!found_all.contains(found_terms.get(0))) … … 237 260 238 261 239 / /replacing the words in init_query, corresponding to the phrase, with the expanded phrase (found_all)262 /** replacing the words in init_query, corresponding to the phrase, with the expanded phrase (found_all) */ 240 263 String[] spl = phrase.split(" "); 241 264 String first_word = spl[0]; … … 271 294 272 295 273 // replacing phrases, containing words from the found phrase, from query terms (to avoid duplicate expansions, to reduce the time spent on searching) 296 /** 297 * replacing phrases, containing words from the found phrase, from query terms 298 * (to avoid duplicate expansions, to reduce the time spent on searching) 299 */ 274 300 z = 0; 275 301 List<String> new_query_terms = new ArrayList<String>(); … … 294 320 } 295 321 296 //constructing the expanded query 322 /** 323 * constructing the expanded query 324 * @param parsed 325 * @return 326 */ 297 327 //TODO: do it with the help of Lucene. OR isn't necessary, ' ' = OR 298 328 public String output(List<String> parsed) { … … 306 336 String next = init_query.get(i + 1); 307 337 if ((parsed.contains(s)) || (spec_symbols.contains(s))) { 308 // to avoid having stopwords in "" 338 /** 339 * to avoid having stopwords in "" 340 */ 309 341 if (s.split(" ").length == 1) 310 342 res_query += s + " "; -
biobank_search/handwritten/java/plugins/OntologyIndexerAndSearcher.java
r3413 r3481 42 42 public class OntologyIndexerAndSearcher { 43 43 44 static final String INDEX_DIRECTORY = "C:/Dasha/studies/programming/eclipse/GEO_ontocat_index"; 45 static final String ONTOLOGIES_DIRECTORY = "C:/Dasha/ontologies/"; 44 //static final String INDEX_DIRECTORY = "C:/Dasha/studies/programming/eclipse/GEO_ontocat_index"; 45 static final String INDEX_DIRECTORY = "/Users/despoina/Documents/dashaworkspace/GEO_ontocat_index"; 46 //static final String ONTOLOGIES_DIRECTORY = "C:/Dasha/ontologies/"; 47 static final String ONTOLOGIES_DIRECTORY = "/Users/despoina/Documents/workspace/biobank_search/ontologies"; 46 48 47 49 OntologyService os; … … 61 63 62 64 public void buildIndexOntocat() throws Exception { 63 try {65 try { 64 66 65 67 /**An IndexWriter creates and maintains an index.
Note: See TracChangeset
for help on using the changeset viewer.