<!-- Page caption from the hosting site: "AI-Generated SVG View Details" -->
<!--
  Architecture diagram. Left: three feature rows (Image, Text, Point-Image) and a
  "Contrastive Embed" block. Right: a stack of attention blocks labelled
  "Multi-modality Decoder". A row of query tokens enters at the bottom of the
  stack and a row of output tokens sits at the top.

  Authoring notes:
  * Rectangle geometry (width/height/rx/ry) is written as attributes, not CSS,
    so renderers that only implement SVG 1.1 still draw the boxes.
  * Per-element colours are presentation attributes. The shared classes below
    deliberately do NOT set fill/stroke colour on rects, because any class rule
    would override the attribute.
-->
<svg width="850" height="620" viewBox="0 0 850 620" xmlns="http://www.w3.org/2000/svg">
  <defs>
    <!-- Arrow tip used by every .connector via marker-end. -->
    <marker id="arrowhead" markerWidth="10" markerHeight="7" refX="9" refY="3.5" orient="auto">
      <polygon points="0 0, 10 3.5, 0 7" fill="#000"/>
    </marker>
    <style type="text/css">
      .label-text { font-family: Helvetica, Arial, sans-serif; font-size: 14px; fill: #333; text-anchor: middle; }
      .title-text { font-family: Helvetica, Arial, sans-serif; font-size: 16px; font-weight: bold; fill: #000; text-anchor: middle; }
      /* Size modifiers. Declared after the base text classes so that, at equal
         specificity, they win on source order. */
      .text-sm { font-size: 12px; }
      .text-md { font-size: 14px; }
      .text-lg { font-size: 18px; }
      .box-stroke { stroke: #666; stroke-width: 2; fill: none; }
      /* Fill is set per element (fill attribute) so individual boxes can be tinted. */
      .dashed-box { stroke: #666; stroke-width: 2; stroke-dasharray: 5,5; }
      /* Stroke colour is set per element (stroke attribute). */
      .feature-rect { stroke-width: 1; }
      .connector { stroke: #000; stroke-width: 2; fill: none; marker-end: url(#arrowhead); }
      .line-connector { stroke: #000; stroke-width: 2; }
    </style>
  </defs>

  <!-- Background panels: blue (left, fusion) and yellow (right, decoder). -->
  <rect x="30" y="195" width="285" height="300" rx="15" ry="15" fill="#dae8fc" stroke="#b6d4fe" stroke-width="2"/>
  <rect x="360" y="115" width="330" height="445" rx="15" ry="15" fill="#fff2cc" stroke="#d6b656" stroke-width="2"/>

  <!-- Row 1: Image Feature tokens, feeding 2D Visual Cross-Attn as K,V. -->
  <text x="180" y="115" class="title-text">Image Feature</text>
  <g transform="translate(65, 125)">
    <rect x="0" y="0" width="230" height="50" class="dashed-box" fill="none"/>
    <g transform="translate(15, 10)">
      <rect x="0" y="0" width="30" height="30" class="feature-rect" fill="#d5e8d4" stroke="#82b366"/>
      <rect x="40" y="0" width="30" height="30" class="feature-rect" fill="#d5e8d4" stroke="#82b366"/>
      <rect x="80" y="0" width="30" height="30" class="feature-rect" fill="#d5e8d4" stroke="#82b366"/>
      <rect x="120" y="0" width="30" height="30" class="feature-rect" fill="#d5e8d4" stroke="#82b366"/>
      <rect x="160" y="0" width="30" height="30" class="feature-rect" fill="#d5e8d4" stroke="#82b366"/>
    </g>
  </g>
  <rect x="395" y="130" width="260" height="40" rx="5" ry="5" fill="#d5e8d4" stroke="#82b366"/>
  <text x="525" y="155" class="label-text" font-weight="bold">2D Visual Cross-Attn</text>
  <line x1="295" y1="150" x2="395" y2="150" class="connector"/>
  <text x="345" y="145" class="label-text text-sm">K,V</text>

  <!-- Row 2: Text Feature tokens, feeding Text Cross-Attn as K,V. -->
  <text x="180" y="215" class="title-text">Text Feature</text>
  <g transform="translate(65, 225)">
    <rect x="0" y="0" width="230" height="50" class="dashed-box" fill="#dae8fc"/>
    <g transform="translate(15, 10)">
      <rect x="0" y="0" width="30" height="30" class="feature-rect" fill="#f8cecc" stroke="#b85450"/>
      <rect x="40" y="0" width="30" height="30" class="feature-rect" fill="#f8cecc" stroke="#b85450"/>
      <rect x="80" y="0" width="30" height="30" class="feature-rect" fill="#f8cecc" stroke="#b85450"/>
      <rect x="120" y="0" width="30" height="30" class="feature-rect" fill="#f8cecc" stroke="#b85450"/>
      <rect x="160" y="0" width="30" height="30" class="feature-rect" fill="#f8cecc" stroke="#b85450"/>
    </g>
  </g>
  <rect x="395" y="230" width="260" height="40" rx="5" ry="5" fill="#f8cecc" stroke="#b85450"/>
  <text x="525" y="255" class="label-text" font-weight="bold">Text Cross-Attn</text>
  <line x1="295" y1="250" x2="395" y2="250" class="connector"/>
  <text x="345" y="245" class="label-text text-sm">K,V</text>

  <!-- Row 3: Point-Image Feature tokens, feeding 3D Visual Cross-Attn as K,V. -->
  <text x="180" y="315" class="title-text">Point-Image Feature</text>
  <g transform="translate(65, 325)">
    <rect x="0" y="0" width="230" height="50" class="dashed-box" fill="#dae8fc"/>
    <g transform="translate(15, 10)">
      <rect x="0" y="0" width="30" height="30" class="feature-rect" fill="#dae8fc" stroke="#6c8ebf"/>
      <rect x="40" y="0" width="30" height="30" class="feature-rect" fill="#1ba1e2" stroke="#006eaf"/>
      <rect x="80" y="0" width="30" height="30" class="feature-rect" fill="#1ba1e2" stroke="#006eaf"/>
      <rect x="120" y="0" width="30" height="30" class="feature-rect" fill="#1ba1e2" stroke="#006eaf"/>
      <rect x="160" y="0" width="30" height="30" class="feature-rect" fill="#dae8fc" stroke="#6c8ebf"/>
    </g>
  </g>
  <rect x="395" y="330" width="260" height="40" rx="5" ry="5" fill="#dae8fc" stroke="#6c8ebf"/>
  <text x="525" y="355" class="label-text" font-weight="bold">3D Visual Cross-Attn</text>
  <line x1="295" y1="350" x2="395" y2="350" class="connector"/>
  <text x="345" y="345" class="label-text text-sm">K,V</text>

  <!-- Row 4: Contrastive Embed block, connected to Self-Attn with a "TopK" label. -->
  <rect x="100" y="425" width="160" height="50" rx="10" ry="10" fill="#f5f5f5" stroke="#666" stroke-width="1"/>
  <text x="180" y="445" class="title-text text-md">Contrastive</text>
  <text x="180" y="465" class="title-text text-md">Embed</text>
  <rect x="395" y="430" width="260" height="40" rx="5" ry="5" fill="#e1d5e7" stroke="#9673a6"/>
  <text x="525" y="455" class="label-text" font-weight="bold">Self-Attn</text>
  <line x1="260" y1="450" x2="395" y2="450" class="connector"/>
  <text x="345" y="445" class="label-text text-sm">TopK</text>

  <!-- Inputs to Contrastive Embed: from the Point-Image row (straight down)
       and from the Text row (routed around the left side). -->
  <line x1="180" y1="375" x2="180" y2="425" class="connector"/>
  <polyline points="65,250 45,250 45,450 100,450" class="connector"/>

  <!-- Query tokens entering the bottom of the decoder stack. -->
  <g transform="translate(420, 500)">
    <rect x="0" y="0" width="150" height="40" class="dashed-box" fill="none"/>
    <!-- Middle-anchored label placed to the right of the box. x was 165, which
         straddled the box edge at x=150; 180 assumes the word is roughly 40px
         wide at 14px bold. -->
    <text x="180" y="25" class="label-text" font-weight="bold">Query</text>
    <g transform="translate(5, 5)">
      <rect x="0" y="0" width="30" height="30" class="feature-rect" fill="#e1d5e7" stroke="#fff"/>
      <rect x="35" y="0" width="30" height="30" class="feature-rect" fill="#e1d5e7" stroke="#fff"/>
      <rect x="70" y="0" width="30" height="30" class="feature-rect" fill="#e1d5e7" stroke="#fff"/>
      <rect x="105" y="0" width="30" height="30" class="feature-rect" fill="#e1d5e7" stroke="#fff"/>
    </g>
  </g>

  <!-- Vertical flow through the decoder stack, bottom to top; each hop between
       attention blocks is labelled Q. -->
  <line x1="525" y1="500" x2="525" y2="470" class="connector"/>
  <line x1="525" y1="430" x2="525" y2="370" class="connector"/>
  <text x="540" y="400" class="label-text">Q</text>
  <line x1="525" y1="330" x2="525" y2="270" class="connector"/>
  <text x="540" y="300" class="label-text">Q</text>
  <line x1="525" y1="230" x2="525" y2="170" class="connector"/>
  <text x="540" y="200" class="label-text">Q</text>
  <line x1="525" y1="130" x2="525" y2="80" class="connector"/>

  <!-- Output tokens above the decoder stack. -->
  <g transform="translate(420, 20)">
    <rect x="0" y="0" width="210" height="40" class="dashed-box" fill="none"/>
    <g transform="translate(25, 5)">
      <rect x="0" y="0" width="30" height="30" class="feature-rect" fill="#d5e8d4" stroke="#fff"/>
      <rect x="40" y="0" width="30" height="30" class="feature-rect" fill="#dae8fc" stroke="#fff"/>
      <rect x="80" y="0" width="30" height="30" class="feature-rect" fill="#f8cecc" stroke="#fff"/>
      <rect x="120" y="0" width="30" height="30" class="feature-rect" fill="#e1d5e7" stroke="#fff"/>
    </g>
  </g>

  <!-- Panel captions. -->
  <text x="175" y="525" class="title-text text-lg">3D visual-Text Fusion</text>
  <text x="525" y="585" class="title-text text-lg">Multi-modality Decoder</text>
</svg>
<!-- Page footer from the hosting site: "High-Quality Vector Graphics | SVGX" -->