PTF-SimCM: A Simple Contrastive Model with Polysemous Text Fusion for Visual Similarity Metric

<table class="table-group" id="tab1"><tr><td><table class="table"><tr><td class="thead-hr" colspan="2"><hr/></td></tr><tr class="thead"><td class="align_left">Symbol</td><td class="align_center">Description</td></tr><tr><td class="thead-hr" colspan="2"><hr/></td></tr><tr><td class="align_left"><span style="width: 7.39387pt;"><svg height="6.1673pt" id="M23" style="vertical-align:-0.2063904pt" version="1.1" viewbox="-0.0498162 -5.96091 7.39387 6.1673" width="7.39387pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M536 404C536 423 520 448 491 448C445 448 398 404 308 283L286 338C255 416 242 448 217 448C182 448 138 402 92 341L111 321C149 368 169 378 178 378C188 378 198 363 214 320L254 212C185 117 138 65 107 65C98 65 85 69 82 75C79 82 73 86 65 86C44 86 23 60 23 39C23 7 44 -12 71 -12C119 -12 168 33 265 177L306 61C321 17 347 -12 373 -12C413 -12 465 33 507 96L491 119C459 84 432 60 413 60C395 60 378 92 358 148L321 250C341 279 369 310 389 332C417 363 439 382 456 382C466 382 475 376 481 368C486 361 492 358 496 358C513 358 536 381 536 404Z"></path></g></svg></span></td><td class="align_center">Input image</td></tr><tr><td class="align_left"><span style="width: 5.39742pt;"><svg height="6.1673pt" id="M24" style="vertical-align:-0.2063904pt" version="1.1" viewbox="-0.0498162 -5.96091 5.39742 6.1673" width="5.39742pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M383 397C383 424 344 448 299 448C244 448 172 409 132 375C66 319 23 227 23 146C23 42 74 -12 146 -12C208 -12 298 30 359 103L343 124C315 95 248 48 192 48C145 48 111 85 111 163C111 228 129 294 151 330C171 363 201 401 241 401C275 401 302 384 325 356C332 347 339 344 348 348C373 360 383 381 383 397Z"></path></g></svg></span></td><td class="align_center">Textual description of image</td></tr><tr><td class="align_left"><span style="width: 17.9486pt;"><svg 
height="13.0126pt" id="M25" style="vertical-align:-1.5765pt" version="1.1" viewbox="-0.0498162 -11.4361 17.9486 13.0126" width="17.9486pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M324 430H196L233 583L223 592L145 529L120 430H54L29 396L31 388H111L56 126C33 15 54 -12 77 -12C137 -12 214 57 250 95L233 119C208 92 155 59 138 59C126 59 120 70 131 125L186 390L298 394L324 430Z"></path></g><g transform="matrix(.013,0,0,-0.013,4.433,0)"><path d="M95 130C70 130 46 113 46 88C46 72 54 64 59 64C93 55 121 33 121 -3C121 -41 93 -68 44 -88L55 -117C117 -98 186 -56 186 22C186 91 131 130 95 130Z"></path></g><g transform="matrix(.013,0,0,-0.013,9.576,0)"><path d="M324 430H196L233 583L223 592L145 529L120 430H54L29 396L31 388H111L56 126C33 15 54 -12 77 -12C137 -12 214 57 250 95L233 119C208 92 155 59 138 59C126 59 120 70 131 125L186 390L298 394L324 430Z"></path></g><g transform="matrix(.0091,0,0,-0.0091,14.01,-5.741)"><path d="M310 541L304 571C290 586 211 619 185 610L80 76L131 52L310 541Z"></path></g></svg></span></td><td class="align_center">Image augmentation</td></tr><tr><td class="align_left"><span style="width: 11.9658pt;"><svg height="12.7178pt" id="M26" style="vertical-align:-3.42947pt" version="1.1" viewbox="-0.0498162 -9.28833 11.9658 12.7178" width="11.9658pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M619 670C619 686 593 712 555 712S459 686 410 634S335 504 320 430H250L219 400L222 388H312L258 73C223 -133 201 -166 187 -180C175 -191 158 -199 140 -199C123 -199 88 -188 74 -172C68 -166 63 -164 54 -171C38 -185 23 -201 23 -215C23 -236 60 -261 93 -261C122 -261 161 -247 207 -200C268 -138 300 -71 337 94C365 220 376 277 394 387L501 399L521 430H401C432 623 464 665 501 665C524 665 544 651 567 627C577 617 583 618 592 625C601 631 619 651 619 670Z"></path></g><g transform="matrix(.0091,0,0,-0.0091,6.721,3.132)"><path 
d="M482 498C482 606 448 710 328 710C138 710 24 416 24 218C24 99 58 -12 184 -12C373 -12 482 294 482 498ZM395 521C395 483 391 449 383 395H128C158 537 226 671 312 671C388 671 395 566 395 521ZM375 342C348 190 284 25 192 25C128 25 109 116 109 199C109 243 112 292 121 342H375Z"></path></g></svg></span></td><td class="align_center">Image encoder</td></tr><tr><td class="align_left"><span style="width: 11.974pt;"><svg height="9.25986pt" id="M27" style="vertical-align:-0.2455397pt" version="1.1" viewbox="-0.0498162 -9.01432 11.974 9.25986" width="11.974pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M856 669C824 657 796 643 772 628C709 667 631 687 551 687C314 687 158 548 158 412C158 332 216 282 300 282C445 282 516 426 527 561H507C478 406 414 312 302 312C245 312 199 352 199 408C199 535 345 657 550 657C617 657 687 644 742 609C644 538 583 430 495 271C383 71 317 16 193 16C122 16 73 60 73 95H75C82 84 97 83 105 83C133 83 148 106 148 131C148 160 125 179 97 179C64 179 38 151 38 108C38 42 104 -15 211 -15C368 -15 496 71 610 302C672 426 704 522 770 588C803 559 825 518 825 465C825 382 787 316 731 316C710 316 697 328 697 345C697 358 700 380 733 399L723 417C688 398 665 367 665 338C665 303 688 272 737 272C821 272 886 359 886 447C886 516 850 572 797 612C816 627 836 639 863 650L856 669Z"></path></g></svg></span></td><td class="align_center">Cross-modal encoder</td></tr><tr><td class="align_left"><span style="width: 9.95144pt;"><svg height="8.68572pt" id="M28" style="vertical-align:-0.0498209pt" version="1.1" viewbox="-0.0498162 -8.6359 9.95144 8.68572" width="9.95144pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M743 650H503L496 622L527 618C563 613 564 603 532 573C449 495 371 431 323 392C301 374 272 355 246 346L280 522C297 609 300 614 379 622L385 650H135L129 622C209 614 215 609 198 522L124 133C106 39 99 35 23 28L17 
0H271L277 28C193 35 192 39 208 133L239 316C264 328 280 325 303 288C368 183 435 90 502 0H652L659 28C602 34 584 43 543 94C495 154 403 283 347 369L574 554C634 603 659 612 735 624L743 650Z"></path></g></svg></span></td><td class="align_center">The number of cross-modal embeddings</td></tr><tr><td class="align_left"><span style="width: 12.3181pt;"><svg height="9.39034pt" id="M29" style="vertical-align:-3.42943pt" version="1.1" viewbox="-0.0498162 -5.96091 12.3181 9.39034" width="12.3181pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M546 430L539 434C529 434 505 438 495 440C473 444 450 448 430 448C352 448 265 412 213 366C145 306 96 203 96 103C96 22 135 -12 160 -12C190 -12 238 14 262 32C310 68 368 120 411 184H413C403 117 396 75 384 21C353 -118 325 -158 291 -184C270 -200 241 -205 208 -205C133 -205 90 -164 74 -110C70 -98 58 -100 49 -107C34 -119 23 -140 23 -155C23 -190 74 -261 166 -261C219 -261 280 -233 314 -208C383 -157 446 -79 470 81C491 223 529 388 546 430ZM456 386C452 357 433 283 420 252C402 216 366 174 325 129C288 88 239 56 212 56C192 56 182 77 182 120C182 165 199 242 226 292C256 348 281 377 311 389C327 395 353 402 375 402C408 402 436 394 456 386Z"></path></g><g transform="matrix(.0091,0,0,-0.0091,7.072,3.132)"><path d="M482 498C482 606 448 710 328 710C138 710 24 416 24 218C24 99 58 -12 184 -12C373 -12 482 294 482 498ZM395 521C395 483 391 449 383 395H128C158 537 226 671 312 671C388 671 395 566 395 521ZM375 342C348 190 284 25 192 25C128 25 109 116 109 199C109 243 112 292 121 342H375Z"></path></g></svg></span></td><td class="align_center">Multimodal projector module</td></tr><tr><td class="align_left"><span style="width: 11.5613pt;"><svg height="9.39034pt" id="M30" style="vertical-align:-3.42943pt" version="1.1" viewbox="-0.0498162 -5.96091 11.5613 9.39034" width="11.5613pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g 
transform="matrix(.013,0,0,-0.013,0,0)"><path d="M474 429L457 433C435 440 389 448 367 448C348 448 323 446 309 443C266 434 195 406 148 366C78 307 23 210 23 101C23 35 55 -12 92 -12C118 -12 146 1 196 35C247 70 311 130 346 173H348L281 -148C268 -211 256 -221 208 -229L191 -232L187 -257L433 -245L437 -219L411 -216C357 -210 350 -205 362 -140L427 207C447 315 461 381 474 429ZM387 387C379 337 363 262 355 236C318 180 201 57 142 57C126 57 112 81 112 128C112 205 150 321 220 376C244 395 280 403 312 403C345 403 370 396 387 387Z"></path></g><g transform="matrix(.0091,0,0,-0.0091,6.318,3.132)"><path d="M482 498C482 606 448 710 328 710C138 710 24 416 24 218C24 99 58 -12 184 -12C373 -12 482 294 482 498ZM395 521C395 483 391 449 383 395H128C158 537 226 671 312 671C388 671 395 566 395 521ZM375 342C348 190 284 25 192 25C128 25 109 116 109 199C109 243 112 292 121 342H375Z"></path></g></svg></span></td><td class="align_center">Predictor module</td></tr><tr><td class="align_left"><span style="width: 7.11985pt;"><svg height="6.1673pt" id="M31" style="vertical-align:-0.2063904pt" version="1.1" viewbox="-0.0498162 -5.96091 7.11985 6.1673" width="7.11985pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M515 96L502 119C471 88 431 62 423 62C416 62 411 70 417 101C440 223 469 341 497 448H486L412 422L380 277C330 188 210 57 152 57C137 57 126 69 139 124L195 366C210 431 205 448 182 448C155 448 89 413 23 350L36 326C73 354 103 376 112 376C118 376 118 365 113 340L61 118C54 90 52 68 52 51C52 0 75 -12 98 -12S151 -3 181 17C242 58 305 119 362 193H364L345 104C323 3 339 -12 359 -12C390 -12 464 35 515 96Z"></path></g></svg></span></td><td class="align_center">Fused feature</td></tr><tr><td class="align_left"><span style="width: 6.59789pt;"><svg height="6.44133pt" id="M32" style="vertical-align:-0.29774pt" version="1.1" viewbox="-0.0498162 -6.14359 6.59789 6.44133" width="6.59789pt" xmlns="http://www.w3.org/2000/svg" 
xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M475 445L452 468C423 436 401 426 385 426S352 427 305 436C268 443 216 448 185 448C156 448 128 425 102 382C89 360 77 338 64 304L89 291C117 340 143 380 190 380C214 380 268 376 295 373C320 370 344 371 362 374L70 101C32 65 23 36 23 18C23 5 27 -3 31 -8C36 -8 39 -6 42 -1C52 15 67 31 80 40S110 49 143 33C195 8 257 -20 290 -20C339 -20 382 11 439 129L416 148C366 74 334 55 305 55C278 55 236 68 203 81C169 94 141 100 122 96C205 170 348 308 411 376L475 445Z"></path></g></svg></span></td><td class="align_center">Metric embedding</td></tr><tr><td class="align_left"><span style="width: 7.83752pt;"><svg height="10.2124pt" id="M33" style="vertical-align:-3.42943pt" version="1.1" viewbox="-0.0498162 -6.78297 7.83752 10.2124" width="7.83752pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M570 304C570 398 525 448 414 448C385 448 343 445 312 434L329 511L321 518C297 504 262 482 244 460L233 411C195 397 159 381 128 358L135 332C160 347 189 360 224 373L111 -147C97 -210 84 -218 17 -231L13 -257L254 -247L259 -218L233 -216C183 -212 177 -202 189 -142L218 -1C238 -10 266 -12 283 -12C351 3 429 48 483 105C543 168 570 242 570 304ZM482 289C482 161 380 33 304 33C278 33 248 51 233 69L303 396C326 400 352 403 369 403C428 403 482 380 482 289Z"></path></g></svg></span></td><td class="align_center">Vector output from predictor</td></tr><tr><td class="align_left"><span style="width: 11.7652pt;"><svg height="9.39034pt" id="M34" style="vertical-align:-3.42943pt" version="1.1" viewbox="-0.0498162 -5.96091 11.7652 9.39034" width="11.7652pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M352 391C352 416 319 448 267 448C236 448 173 423 147 400C107 364 96 332 96 304C96 248 143 210 193 181C241 153 258 124 258 100C258 72 232 38 184 38C151 38 107 66 81 108C77 
114 64 116 55 111C34 99 23 84 23 65C23 29 81 -12 134 -12C220 -12 325 61 325 141C325 184 297 215 234 256C194 282 161 309 161 346C161 380 188 401 217 401C255 401 279 380 301 353C308 344 313 341 325 347C341 355 352 371 352 391Z"></path></g><g transform="matrix(.013,0,0,-0.013,4.223,0)"><path d="M546 430L539 434C529 434 505 438 495 440C473 444 450 448 430 448C352 448 265 412 213 366C145 306 96 203 96 103C96 22 135 -12 160 -12C190 -12 238 14 262 32C310 68 368 120 411 184H413C403 117 396 75 384 21C353 -118 325 -158 291 -184C270 -200 241 -205 208 -205C133 -205 90 -164 74 -110C70 -98 58 -100 49 -107C34 -119 23 -140 23 -155C23 -190 74 -261 166 -261C219 -261 280 -233 314 -208C383 -157 446 -79 470 81C491 223 529 388 546 430ZM456 386C452 357 433 283 420 252C402 216 366 174 325 129C288 88 239 56 212 56C192 56 182 77 182 120C182 165 199 242 226 292C256 348 281 377 311 389C327 395 353 402 375 402C408 402 436 394 456 386Z"></path></g></svg></span></td><td class="align_center">Stop-gradient operation</td></tr><tr class="table-tr"><td colspan="2"><hr class="tbody-hr"/></td></tr></table></td></tr></table>

<div>Notation list of the proposed method.</div>

Complexity

tab1

Table 1

Table 1: PTF-SimCM: A Simple Contrastive Model with Polysemous Text Fusion for Visual Similarity Metric