1: 2: 3: 4: 5: 6: 7: 8: 9: 10: 11: 12: 13: 14: 15: 16: 17: 18: 19: 20: 21: 22: 23: 24: 25: 26: 27: 28: 29: 30: 31: 32: 33: 34: 35: 36: 37: 38: 39: 40: 41: 42: 43: 44: 45: 46: 47: 48: 49: 50: 51: 52: 53: 54: 55: 56: 57: 58: 59: 60: 61: 62: 63: 64: 65: 66: 67: 68: 69: 70: 71: 72: 73: 74: 75: 76: 77: 78: 79: 80: 81: 82: 83: 84: 85: 86: 87: 88: 89: 90: 91: 92: 93: 94: 95: 96: 97: 98: 99: 100: 101: 102: 103: 104: 105: 106: 107: 108: 109: 110: 111: 112: 113: 114: 115: 116: 117: 118: 119: 120: 121: 122: 123: 124: 125: 126: 127: 128: 129: 130: 131: 132: 133: 134: 135: 136: 137: 138: 139: 140: 141: 142: 143: 144: 145: 146: 147: 148: 149: 150: 151: 152: 153: 154: 155: 156: 157: 158: 159: 160: 161: 162: 163: 164: 165: 166: 167: 168: 169: 170: 171: 172: 173: 174: 175: 176: 177: 178: 179: 180: 181: 182: 183: 184: 185: 186: 187: 188: 189: 190: 191: 192: 193: 194: 195: 196: 197: 198: 199: 200: 201: 202: 203: 204: 205: 206: 207: 208: 209: 210: 211: 212: 213: 214: 215: 216: 217: 218: 219: 220: 221: 222: 223: 224: 225: 226: 227: 228: 229: 230: 231: 232: 233: 234: 235: 236: 237: 238: 239: 240: 241: 242: 243: 244: 245: 246: 247: 248: 249: 250: 251: 252: 253: 254: 255: 256: 257: 258: 259: 260: 261: 262: 263: 264: 265: 266: 267: 268: 269: 270: 271: 272: 273: 274: 275: 276: 277: 278: 279: 280: 281: 282: 283: 284: 285: 286: 287: 288: 289: 290: 291: 292: 293: 294: 295: 296: 297: 298: 299: 300: 301: 302: 303: 304: 305: 306: 307: 308: 309: 310: 311: 312: 313: 314: 315: 316: 317: 318: 319: 320: 321: 322: 323: 324: 325: 326: 327: 328: 329: 330: 331: 332: 333: 334: 335: 336: 337: 338: 339: 340: 341: 342: 343: 344: 345: 346: 347: 348: 349: 350: 351: 352: 353: 354: 355: 356: 357: 358: 359: 360: 361: 362: 363: 364: 365: 366: 367: 368: 369: 370: 371: 372: 373: 374: 375: 376: 377: 378: 379: 380: 381: 382: 383: 384: 385: 386: 387: 388: 389: 390: 391: 392: 393: 394: 395: 396: 397: 398: 399: 400: 401: 402: 403: 404: 405: 406: 407: 408: 409: 410: 411: 412: 413: 414: 415: 416: 417: 418: 419: 420: 421: 
422: 423: 424: 425: 426: 427: 428: 429: 430: 431: 432: 433: 434: 435: 436: 437: 438: 439: 440: 441: 442: 443: 444: 445: 446: 447: 448: 449: 450: 451: 452: 453: 454: 455: 456: 457: 458: 459: 460: 461: 462: 463: 464: 465: 466: 467: 468: 469: 470: 471: 472: 473: 474: 475: 476: 477: 478: 479: 480: 481: 482: 483: 484: 485: 486: 487: 488: 489: 490: 491: 492: 493: 494: 495: 496: 497: 498: 499: 500: 501: 502: 503: 504: 505: 506: 507: 508: 509: 510: 511: 512: 513: 514: 515: 516: 517: 518: 519: 520: 521: 522: 523: 524: 525: 526: 527: 528: 529: 530: 531: 532: 533: 534: 535: 536: 537: 538: 539: 540: 541: 542: 543: 544: 545: 546: 547: 548: 549: 550: 551: 552: 553: 554: 555: 556: 557: 558: 559: 560: 561: 562: 563: 564: 565: 566: 567: 568: 569: 570: 571: 572: 573: 574: 575: 576: 577: 578: 579: 580: 581: 582: 583: 584: 585: 586: 587: 588: 589: 590: 591: 592: 593: 594: 595: 596: 597: 598: 599: 600: 601: 602: 603: 604: 605: 606: 607: 608: 609: 610: 611: 612: 613: 614: 615: 616: 617: 618: 619: 620: 621: 622: 623: 624: 625: 626: 627: 628: 629: 630: 631: 632: 633: 634: 635: 636: 637: 638: 639: 640: 641: 642: 643: 644: 645: 646: 647: 648: 649: 650: 651: 652:
<?php
/**
 * Client for the SAE word segmentation HTTP service.
 *
 * Usage: call segment() and, when it returns false, inspect errno() and
 * errmsg() for the reason. Error state is overwritten by every call.
 */
class SaeSegment extends SaeObject
{
    /** Error code of the most recent call (SAE_Success when it succeeded). */
    private $_errno = SAE_Success;

    /** Human-readable message matching $_errno. */
    private $_errmsg = "OK";

    /** Known error codes mapped to their messages. */
    private $_errmsgs = array(
        -1 => "word segmentation service internal error",
        -2 => "parameters (word_tag, encoding) error.",
        -3 => "context can not be empty",
        -4 => "unknown error",
        607 => "service is not enabled",
    );

    /** Encodings accepted for the `encoding` request parameter. */
    private $_encodings = array('GBK', 'UTF-8', 'UCS-2');

    /** Endpoint every segmentation request is POSTed to. */
    const baseurl = "http://segment.sae.sina.com.cn:81/urlclient.php";

    public function __construct() {
    }

    /**
     * Sends $context to the segmentation service and returns the decoded result.
     *
     * @param string $context  Text to segment; must not be empty after trimming.
     * @param mixed  $word_tag Truthy sends word_tag=1 to the service, falsy sends 0.
     * @param string $encoding One of GBK, UTF-8 or UCS-2 (case-insensitive).
     *                         Any other value falls back to UTF-8.
     *
     * @return array|false The decoded JSON response on success; false on failure
     *                     (see errno() / errmsg()). When the effective encoding is
     *                     not UTF-8, each entry's 'word' value is converted from
     *                     UTF-8 to that encoding before being returned.
     */
    public function segment($context, $word_tag = 0, $encoding = 'UTF-8') {
        if ( trim( $context ) === '' ) {
            return $this->_segmentError(-3);
        }
        $post = array('context' => $context);

        // Normalise once and keep using the *effective* encoding below, so the
        // response is never converted to an encoding the request did not use.
        $encoding = strtoupper(trim($encoding));
        if ( !in_array( $encoding, $this->_encodings, true ) ) {
            $encoding = 'UTF-8';
        }

        $params = array(
            'word_tag' => $word_tag ? 1 : 0,
            'encoding' => $encoding,
        );

        $ret = $this->postData($post, $params);

        // NOTE(review): this assumes the service always answers in UTF-8
        // regardless of the requested encoding - confirm against the service docs.
        if ( $encoding !== 'UTF-8' && is_array($ret) ) {
            foreach ($ret as $k => $v) {
                // Skip entries that do not have the expected shape instead of
                // failing on a scalar or a missing 'word' key.
                if ( is_array($v) && isset($v['word']) && is_string($v['word']) ) {
                    $ret[$k]['word'] = mb_convert_encoding( $v['word'], $encoding, 'UTF-8' );
                }
            }
        }
        return $ret;
    }

    /**
     * @return int Error code of the most recent call.
     */
    public function errno() {
        return $this->_errno;
    }

    /**
     * @return string Error message of the most recent call.
     */
    public function errmsg() {
        return $this->_errmsg;
    }

    /**
     * Records an error and returns false so callers can `return` it directly.
     *
     * @param int         $errno  Error code to store.
     * @param string|null $errmsg Message to store; when null the message is looked
     *                            up in $_errmsgs, falling back to "unknown error"
     *                            for codes that are not in the map.
     *
     * @return bool Always false.
     */
    private function _segmentError($errno, $errmsg = null) {
        if ($errmsg === null) {
            $errmsg = isset($this->_errmsgs[$errno])
                ? $this->_errmsgs[$errno]
                : $this->_errmsgs[-4];
        }
        $this->_errno = $errno;
        $this->_errmsg = $errmsg;
        return false;
    }

    /**
     * POSTs $post to the service with $params in the query string and decodes
     * the JSON response.
     *
     * @param array|string $post   POST fields; an array is URL-encoded first.
     * @param array        $params Query string parameters.
     *
     * @return array|false Decoded response array, or false after recording an error.
     */
    private function postData($post, $params) {
        $url = self::baseurl . '?' . http_build_query( $params );
        if (is_array($post)) {
            $post = http_build_query($post);
        }

        $s = curl_init();
        if ($s === false) {
            return $this->_segmentError(-4, "can not reach word segmentation server");
        }
        curl_setopt($s,CURLOPT_URL,$url);
        curl_setopt($s,CURLOPT_HTTP_VERSION,CURL_HTTP_VERSION_1_0);
        curl_setopt($s,CURLOPT_TIMEOUT,5);
        curl_setopt($s,CURLOPT_RETURNTRANSFER,true);
        curl_setopt($s,CURLINFO_HEADER_OUT, true);
        curl_setopt($s,CURLOPT_POST,true);
        curl_setopt($s,CURLOPT_POSTFIELDS,$post);
        $ret = curl_exec($s);
        $info = curl_getinfo($s);
        curl_close($s);

        // No HTTP status at all means the request never got a response.
        if (empty($info['http_code'])) {
            return $this->_segmentError(-4, "can not reach word segmentation server");
        }
        if ($info['http_code'] == 607) {
            return $this->_segmentError(607);
        }
        if ($info['http_code'] != 200) {
            return $this->_segmentError(-1);
        }
        if ($info['size_download'] == 0) {
            return $this->_segmentError(SAE_ErrInternal, "word segmentation service internal error");
        }

        $array = is_string($ret) ? json_decode(trim($ret), true) : null;

        // A body that is not a JSON array/object cannot be a valid result;
        // report it instead of passing null or a scalar on to count().
        if ( !is_array($array) ) {
            return $this->_segmentError(-1);
        }

        // The service reports its own failures as a one-element array holding
        // a negative integer code.
        if ( count( $array ) === 1 && isset($array[0]) && is_int( $array[0] ) && $array[0] < 0 ) {
            return $this->_segmentError($array[0]);
        }

        $this->_errno = SAE_Success;
        $this->_errmsg = 'OK';
        return $array;
    }

    // POSTAG_ID_* identifiers.
    // NOTE(review): presumably the part-of-speech tag ids the service attaches to
    // each word when word_tag is enabled; the meaning of the individual ids is
    // not documented in this file - confirm against the service documentation.
    const POSTAG_ID_UNKNOW = 0;
    const POSTAG_ID_A = 10;
    const POSTAG_ID_B = 20;
    const POSTAG_ID_C = 30;
    const POSTAG_ID_C_N = 31;
    const POSTAG_ID_C_Z = 32;
    const POSTAG_ID_D = 40;
    const POSTAG_ID_D_B = 41;
    const POSTAG_ID_D_M = 42;
    const POSTAG_ID_E = 50;
    const POSTAG_ID_F = 60;
    const POSTAG_ID_F_S = 61;
    const POSTAG_ID_F_N = 62;
    const POSTAG_ID_F_V = 63;
    const POSTAG_ID_F_Z = 64;
    const POSTAG_ID_H = 70;
    const POSTAG_ID_H_M = 71;
    const POSTAG_ID_H_T = 72;
    const POSTAG_ID_H_NR = 73;
    const POSTAG_ID_H_N = 74;
    const POSTAG_ID_K = 80;
    const POSTAG_ID_K_M = 81;
    const POSTAG_ID_K_T = 82;
    const POSTAG_ID_K_N = 83;
    const POSTAG_ID_K_S = 84;
    const POSTAG_ID_K_Z = 85;
    const POSTAG_ID_K_NT = 86;
    const POSTAG_ID_K_NS = 87;
    const POSTAG_ID_M = 90;
    const POSTAG_ID_N = 95;
    const POSTAG_ID_N_RZ = 96;
    const POSTAG_ID_N_T = 97;
    const POSTAG_ID_N_TA = 98;
    const POSTAG_ID_N_TZ = 99;
    const POSTAG_ID_N_Z = 100;
    const POSTAG_ID_NS = 101;
    const POSTAG_ID_NS_Z = 102;
    const POSTAG_ID_N_M = 103;
    const POSTAG_ID_N_RB = 104;
    const POSTAG_ID_O = 107;
    const POSTAG_ID_P = 108;
    const POSTAG_ID_Q = 110;
    const POSTAG_ID_Q_V = 111;
    const POSTAG_ID_Q_T = 112;
    const POSTAG_ID_Q_H = 113;
    const POSTAG_ID_R = 120;
    const POSTAG_ID_R_D = 121;
    const POSTAG_ID_R_M = 122;
    const POSTAG_ID_R_N = 123;
    const POSTAG_ID_R_S = 124;
    const POSTAG_ID_R_T = 125;
    const POSTAG_ID_R_Z = 126;
    const POSTAG_ID_R_B = 127;
    const POSTAG_ID_S = 130;
    const POSTAG_ID_S_Z = 131;
    const POSTAG_ID_T = 132;
    const POSTAG_ID_T_Z = 133;
    const POSTAG_ID_U = 140;
    const POSTAG_ID_U_N = 141;
    const POSTAG_ID_U_D = 142;
    const POSTAG_ID_U_C = 143;
    const POSTAG_ID_U_Z = 144;
    const POSTAG_ID_U_S = 145;
    const POSTAG_ID_U_SO = 146;
    const POSTAG_ID_W = 150;
    const POSTAG_ID_W_D = 151;
    const POSTAG_ID_W_SP = 152;
    const POSTAG_ID_W_S = 153;
    const POSTAG_ID_W_L = 154;
    const POSTAG_ID_W_R = 155;
    const POSTAG_ID_W_H = 156;
    const POSTAG_ID_Y = 160;
    const POSTAG_ID_V = 170;
    const POSTAG_ID_V_O = 171;
    const POSTAG_ID_V_E = 172;
    const POSTAG_ID_V_SH = 173;
    const POSTAG_ID_V_YO = 174;
    const POSTAG_ID_V_Q = 175;
    const POSTAG_ID_V_A = 176;
    const POSTAG_ID_Z = 180;
    const POSTAG_ID_X = 190;
    const POSTAG_ID_X_N = 191;
    const POSTAG_ID_X_V = 192;
    const POSTAG_ID_X_S = 193;
    const POSTAG_ID_X_T = 194;
    const POSTAG_ID_X_Z = 195;
    const POSTAG_ID_X_B = 196;
    const POSTAG_ID_SP = 200;
    const POSTAG_ID_MQ = 201;
    const POSTAG_ID_RQ = 202;
    const POSTAG_ID_AD = 210;
    const POSTAG_ID_AN = 211;
    const POSTAG_ID_VD = 212;
    const POSTAG_ID_VN = 213;
    const POSTAG_ID_SPACE = 230;
}