1. Create a schema.xml under the $SOLR_HOME/example/solr/addressDB/conf directory. From the data files, I will use 19 fields to make one document. Each of these fields should be defined as a <field> element in this file. This is the schema.xml I will use, and it is in this repository. It is a very simple version of a schema file for this simple example. To take advantage of Solr's search capabilities, we will need to update this file in a future post.
When a field's type is string_ci, any spaces in the field value are removed both when the index is created and when a search query runs.
When a field's type is string_ng, any spaces and commas in the field value are removed both when the index is created and when a search query runs. In addition, an EdgeNGram filter is applied during index creation. Let's say maxGramSize=3 and the field value is 'my test', for example. In this case, the field value will have three index terms associated with it: 'm', 'my', and 'myt' (after the space is eliminated by the specified 'solr.PatternReplaceFilterFactory'). This means that the field value 'my test' will be one of the returned values when a search query passes 'my' (without quotes).
2. In my previous post, the Solr server showed an error because the schema.xml file did not exist. Now, you should see the Solr server with a core named addressDB.
Now, I will write a client application using SolrJ and populate documents into Solr.
3. Download address dataset for our example. See a previous post for the dataset.
4. Posting data to the Solr server from a Java client using SolrJ is simple. First of all, this is the pom.xml for the client project:
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build descriptor for the SolrJ-based address client. -->
<modelVersion>4.0.0</modelVersion>
<!-- Coordinates of the client artifact itself. -->
<groupId>com.jihwan.learn.solr</groupId>
<artifactId>address-db-client</artifactId>
<version>1.0</version>
<dependencies>
<!-- SolrJ: the Java client library used to post documents to the Solr server. -->
<dependency>
<groupId>org.apache.solr</groupId>
<artifactId>solr-solrj</artifactId>
<version>4.10.2</version>
</dependency>
<!-- NOTE(review): presumably needed at runtime by SolrJ's HTTP transport; confirm it is not already pulled in transitively. -->
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
<version>1.10</version>
</dependency>
<!-- NOTE(review): presumably bridges commons-logging calls to SLF4J; no SLF4J binding is declared here, so confirm how logging is expected to be wired. -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>jcl-over-slf4j</artifactId>
<version>1.7.6</version>
</dependency>
</dependencies>
</project>
5. This is the first version of the client process, in the basic form shown on several tutorial pages.
// Schematic outline only: the addField(.....) arguments are placeholders, not compilable code.
// HOST_URL stands for the base URL of the Solr server; addressDB is the target core.
SolrServer server = new HttpSolrServer("HOST_URL/solr/addressDB");
for(int i=1; i<=10000; i++){
// Build one document per record.
SolrInputDocument doc = new SolrInputDocument();
doc.addField(.....);
doc.addField(.....);
// Each document is sent to the server individually, one add() call per document.
server.add(doc);
// Commit after every 200th document.
if(i%200 == 0){
server.commit();
}
}
(** In version 5+, the HttpSolrServer class is deprecated; use the HttpSolrClient class instead.)
Unfortunately, this is extremely slow when loading large data sets. On my MacBook Pro, only about 263,000 address records were processed in 10 minutes.
In my next post, I will show the next version of the client code after a slight modification.
############################################################################
This is the first version of a Java client code.
package com.jihwan.learn.solr;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.common.SolrInputDocument;
public class AddressClient {
// The directory where the address files located at.
private static final String FILE_DIR = “SOME_DIRECTORY/data/addressDB/";
public static void main(String[] args) throws FileNotFoundException {
SolrServer server = new HttpSolrServer("http://localhost:8983/solr/addressDB");
File folder = new File(FILE_DIR);
File[] listOfFiles = folder.listFiles();
File txtfile = null;
BufferedReader br = null;
try {
long startTime = System.currentTimeMillis();
int dataCount = 0;
for (File file : listOfFiles) {
if (file.isFile()) {
if (file.getName().endsWith(".txt")) {
String line;
System.out.println("Read a file " + file.getName());
txtfile = new File(FILE_DIR + file.getName());
br = new BufferedReader(new InputStreamReader(new FileInputStream(txtfile)));
// Dump headers
line = br.readLine(); // English
line = br.readLine(); // Korean
while ((line = br.readLine()) != null) {
SolrInputDocument doc = lineParser(line, ++dataCount);
server.add(doc);
if (dataCount % 1000 == 0) {
server.commit();
}
}
if (dataCount % 1000 != 0) {
server.commit();
}
br.close();
}
}
}
long endTime = System.currentTimeMillis();
System.out.println("Execution Time: " + (endTime-startTime));
}catch (IOException ioE) {
ioE.printStackTrace();
}catch (SolrServerException e) {
e.printStackTrace();
}finally {
try {
br.close();
}catch (IOException e) {
e.printStackTrace();
}
}
}
public static SolrInputDocument lineParser(String line, int id) {
String[] lineTerms = line.split("\\|");
int parseIndex = 0;
SolrInputDocument doc = new SolrInputDocument();
doc.addField("addressId", addressId);
doc.addField("areaCode", nonNullTrim(lineTerms[parseIndex++]));
doc.addField("state", nonNullTrim(lineTerms[parseIndex++]));
doc.addField("state_en", nonNullTrim(lineTerms[parseIndex++]));
doc.addField("city", nonNullTrim(lineTerms[parseIndex++]));
doc.addField("city_en", nonNullTrim(lineTerms[parseIndex++]));
doc.addField("subCity", nonNullTrim(lineTerms[parseIndex++]));
doc.addField("subCity_en", nonNullTrim(lineTerms[parseIndex++]));
parseIndex++; //Skip street_code
doc.addField("streetName", nonNullTrim(lineTerms[parseIndex++]));
doc.addField("streetName_en", nonNullTrim(lineTerms[parseIndex++]));
parseIndex++; //Skip is_basement
String bldNumber = nonNullTrim(lineTerms[parseIndex++]);
if (!isEmpty(lineTerms[parseIndex])) {
bldNumber = bldNumber + "-" + lineTerms[parseIndex].trim();
}
parseIndex++;
doc.addField("buildingNumber", bldNumber);
parseIndex++; //Skip building_mgm_num
doc.addField("bulkDeliveryPlaceName", nonNullTrim(lineTerms[parseIndex++]));
doc.addField("buildingName", nonNullTrim(lineTerms[parseIndex++]));
parseIndex++; //Skip legal_dong_code
doc.addField("dongName", nonNullTrim(lineTerms[parseIndex++]));
doc.addField("riName", nonNullTrim(lineTerms[parseIndex++]));
doc.addField("adminDongName", nonNullTrim(lineTerms[parseIndex++]));
parseIndex++; // skip is_mountain
String grdNumber = nonNullTrim(lineTerms[parseIndex++]);
doc.addField("dongSeq", nonNullTrim(lineTerms[parseIndex++]));
if (!StringUtil.isEmpty(lineTerms[parseIndex])) {
grdNumber = grdNumber + "-" + lineTerms[parseIndex];
}
parseIndex++;
doc.addField("groundNumber", grdNumber);
doc.addField("postalCode", nonNullTrim(lineTerms[parseIndex++]));
return doc;
}
public static boolean isEmpty(String s) {
return s == null || s.trim().length() < 1;
}
public static String nonNullTrim(String value) {
return (isEmpty(value) ? null : value.trim());
}
}


No comments:
Post a Comment